Skip to content

Annotation Examples

Annotate with Dataset Queries

SolveBio makes it possible to annotate a field with data from any other dataset. You can run any dataset query in an expression and use it to annotate a field.

In this example, we have a list of variants in the SolveBio variant ID format. We will annotate them with information about dataset presence (where has this variant been seen?), clinical significance, population allele frequencies, and prevalence in cancer types. You can easily add your own annotation datasets by importing them into SolveBio and querying them in an expression.

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
from solvebio import Annotator

# In this example we will annotate a small list
# of records in real time, but you can also
# annotate entire datasets.
records = [
    {'variant': 'GRCH37-13-32905087-32905087-AAAG'},
    {'variant': 'GRCH37-7-117199644-117199647-A'},
    {'variant': 'GRCH37-17-41244429-41244429-T'},
]

# One dict per output field: the field's name, its data type, whether it
# holds a list, and the expression evaluated for every input record.
fields = [
    {
        # What's the clinical significance of the variant?
        # NOTE(review): the R version of this example queries
        # '.../Combined-GRCh37' -- confirm which dataset path is correct.
        'name': 'clinical_significance',
        'data_type': 'object',
        'is_list': False,
        'expression': """
        dataset_field_values(
            'solvebio:public:/ClinVar/3.7.4-2017-01-30/Combined',
            field='clinical_significance',
            entities=[('variant', record.variant)],
            limit=1
        )
        """
    },
    {
        # How many times has COSMIC seen this variant?
        'name': 'cosmic_count',
        'data_type': 'integer',
        'is_list': False,
        'expression': """
        dataset_field_values(
            'solvebio:public:/COSMIC/1.1.0-COSMIC71/SomaticMutationsCoding-GRCh37',
            field='count',
            entities=[('variant', record.variant)],
            limit=1
        )
        """
    },
    {
        # What's the 1000 Genomes allele frequency?
        'name': '1000g_af',
        'data_type': 'double',
        'is_list': False,
        'expression': """
        dataset_field_values(
            'solvebio:public:/1000G/1.1.0-2015-01-08/1000G-GRCh37',
            field='allele_frequency',
            entities=[('variant', record.variant)],
            limit=1
        )
        """
    },
    {
        # What public datasets is this variant in?
        'name': 'beacons',
        'data_type': 'string',
        'is_list': True,
        'expression': """
        beacon(
            record.variant,
            'variant',
            visibility='public'
        )['found']
        """
    }
]

# The output of annotate() is an iterator.
for r in Annotator(fields).annotate(records):
    # print(r) with a single argument behaves the same on Python 2 and
    # Python 3; the statement form `print r` is a SyntaxError on Python 3.
    print(r)
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
require(solvebio)

# Annotate a small list of records in real time; the same
# field definitions can also be used to annotate entire datasets.
variant_ids <- c(
    'GRCH37-13-32905087-32905087-AAAG',
    'GRCH37-7-117199644-117199647-A',
    'GRCH37-17-41244429-41244429-T'
)
# One record (a list with a single `variant` entry) per variant ID.
records <- lapply(variant_ids, function(id) list(variant=id))

# What's the clinical significance of the variant?
clinical_significance_field <- list(
    name='clinical_significance',
    data_type='object',
    is_list=FALSE,
    expression="
        dataset_field_values(
            'solvebio:public:/ClinVar/3.7.4-2017-01-30/Combined-GRCh37',
            field='clinical_significance',
            entities=[('variant', record.variant)],
            limit=1
        )
        "
)

# How many times has COSMIC seen this variant?
cosmic_count_field <- list(
    name='cosmic_count',
    data_type='integer',
    is_list=FALSE,
    expression="
        dataset_field_values(
            'solvebio:public:/COSMIC/1.1.0-COSMIC71/SomaticMutationsCoding-GRCh37',
            field='count',
            entities=[('variant', record.variant)],
            limit=1
        )
        "
)

# What's the 1000 Genomes allele frequency?
allele_frequency_field <- list(
    name='1000g_af',
    data_type='double',
    is_list=FALSE,
    expression="
        dataset_field_values(
            'solvebio:public:/1000G/1.1.0-2015-01-08/1000G-GRCh37',
            field='allele_frequency',
            entities=[('variant', record.variant)],
            limit=1
        )
        "
)

# What public datasets is this variant in?
beacons_field <- list(
    name='beacons',
    data_type='string',
    is_list=TRUE,
    expression="
        beacon(
            record.variant,
            'variant',
            visibility='public'
        )['found']
        "
)

# Field definitions, in the order they are sent to the annotator.
fields <- list(
    clinical_significance_field,
    cosmic_count_field,
    allele_frequency_field,
    beacons_field
)

# Returns a data frame
Annotator.annotate(records=records, fields=fields)

Harmonize Entity Identifiers

If you have a list of variants in HGVS or VCF format, you can use an expression to normalize them for easy comparison. In the following example, all of the variants are actually the same; they were just processed by different bioinformatics pipelines:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
from solvebio import Annotator

# The same variant written in several notations (VCF line, genomic /
# coding / protein HGVS, and an rsID).
records = [
    {'variant': 'chr17 41244429 . C T .'},
    {'variant': 'NC_000017.10:g.41244429C>T'},
    {'variant': 'NM_007299.3:c.788-1380G>A'},
    {'variant': 'NP_009225.1:p.Ser1040Asn'},
    {'variant': 'NP_009228.2:p.Ser993Asn'},
    # NOTE: rsID variants can represent multiple alleles
    {'variant': 'rs4986852'},
]

# A single output field holding the result of entity_ids() for each record.
fields = [
    {
        'name': 'variant_normalized',
        'data_type': 'string',
        'is_list': False,
        'expression': "entity_ids('variant', record.variant)"
    }
]

for r in Annotator(fields).annotate(records):
    # A single formatted string keeps the output identical on Python 2 and
    # Python 3; the multi-argument print statement is Python 2 only.
    print('%s  ->  %s' % (r['variant'], r['variant_normalized']))

# chr17 41244429 . C T .      ->  GRCH37-17-41244429-41244429-T
# NC_000017.10:g.41244429C>T  ->  GRCH37-17-41244429-41244429-T
# NM_007299.3:c.788-1380G>A   ->  GRCH37-17-41244429-41244429-T
# NP_009225.1:p.Ser1040Asn    ->  GRCH37-17-41244429-41244429-T
# NP_009228.2:p.Ser993Asn     ->  GRCH37-17-41244429-41244429-T
# rs4986852                   ->  GRCH37-17-41244429-41244429-T
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
require(solvebio)

# The same variant written in several notations.
variant_inputs <- c(
    'chr17 41244429 . C T .',
    'NC_000017.10:g.41244429C>T',
    'NM_007299.3:c.788-1380G>A',
    'NP_009225.1:p.Ser1040Asn',
    'NP_009228.2:p.Ser993Asn',
    # NOTE: rsID variants can represent multiple alleles
    'rs4986852'
)
# One record (a list with a single `variant` entry) per input string.
records <- lapply(variant_inputs, function(v) list(variant=v))

# Single output field holding the result of entity_ids() for each record.
normalized_field <- list(
    name='variant_normalized',
    data_type='string',
    is_list=FALSE,
    expression="entity_ids('variant', record.variant)"
)
fields <- list(normalized_field)

annotated <- Annotator.annotate(records=records, fields=fields)
# chr17 41244429 . C T .      ->  GRCH37-17-41244429-41244429-T
# NC_000017.10:g.41244429C>T  ->  GRCH37-17-41244429-41244429-T
# NM_007299.3:c.788-1380G>A   ->  GRCH37-17-41244429-41244429-T
# NP_009225.1:p.Ser1040Asn    ->  GRCH37-17-41244429-41244429-T
# NP_009228.2:p.Ser993Asn     ->  GRCH37-17-41244429-41244429-T
# rs4986852                   ->  GRCH37-17-41244429-41244429-T

Variant Effect Prediction

SolveBio provides built-in support for running variant effect prediction using Veppy, an open source tool inspired by SnpEff and VEP. You can run it in a simple expression, which returns a list of predicted effects on known transcripts:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
from solvebio import Annotator

# Variants to run effect prediction on, in SolveBio variant ID format.
records = [
    {'variant': 'GRCH37-13-32893302-32893302-TA'},
    {'variant': 'GRCH37-17-41244429-41244429-T'},
    {'variant': 'GRCH37-7-55249071-55249071-T'},
]

# A single list-valued field holding the predicted effects per record.
fields = [
    {
        'name': 'effects',
        'data_type': 'object',
        'is_list': True,
        'expression': "predict_variant_effects(record.variant)"
    }
]

for r in Annotator(fields).annotate(records):
    # Reduce each predicted effect to its (so_term, transcript) pair.
    effects = [(eff['so_term'], eff['transcript'])
               for eff in r['effects']]
    # A single formatted string keeps the output identical on Python 2 and
    # Python 3; the multi-argument print statement is Python 2 only.
    print('%s -> %s' % (r['variant'], effects))

# Sample output (as printed by Python 2; Python 3 omits the u prefix):
# GRCH37-13-32893302-32893302-TA -> [(u'frameshift_variant', u'NM_000059.3'), ... ]
# GRCH37-17-41244429-41244429-T  -> [(u'missense_variant', u'NM_007297.3'), ... ]
# GRCH37-7-55249071-55249071-T   -> [(u'missense_variant', u'NM_005228.3'), ... ]
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
require(solvebio)

# Variants to run effect prediction on, in SolveBio variant ID format.
variant_ids <- c(
    'GRCH37-13-32893302-32893302-TA',
    'GRCH37-17-41244429-41244429-T',
    'GRCH37-7-55249071-55249071-T'
)
# One record (a list with a single `variant` entry) per variant ID.
records <- lapply(variant_ids, function(id) list(variant=id))

# Single list-valued field holding the predicted effects per record.
effects_field <- list(
    name='effects',
    data_type='object',
    is_list=TRUE,
    expression="predict_variant_effects(record.variant)"
)
fields <- list(effects_field)

annotated <- Annotator.annotate(records=records, fields=fields)
# NOTE(review): the sample output below is written in Python list/tuple
# notation; confirm how the R client actually shapes the result.
# GRCH37-13-32893302-32893302-TA -> [(u'frameshift_variant', u'NM_000059.3'), ... ]
# GRCH37-17-41244429-41244429-T  -> [(u'missense_variant', u'NM_007297.3'), ... ]
# GRCH37-7-55249071-55249071-T   -> [(u'missense_variant', u'NM_005228.3'), ... ]

Merge a List of Entities

Sometimes you'll want to combine a list of entities (variants, genes, patients, samples) from a few different datasets. Using dataset migrations, it's possible to combine these into a unique list, ready for further annotation. This particular workflow can only be performed as a dataset migration in which records are copied from one or more source datasets into a new target dataset.

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
import solvebio
from solvebio import Dataset
from solvebio import Filter
# Log in before any Dataset calls are made below.
# NOTE(review): presumably uses stored or prompted credentials -- confirm
# against the SolveBio client documentation.
solvebio.login()


def merge_variants(source, target, filters=None):
    """Migrate the variants found in ``source`` into ``target``.

    The first field of ``source`` whose entity type is ``variant`` supplies
    the variant value. Every migrated record is rewritten through three
    target-field expressions:

    * ``_id``: the variant value, upper-cased.
    * ``variant``: a copy of ``_id``, tagged with the ``variant`` entity type.
    * ``sources``: an object mapping ``source.full_name`` to ``True``.

    Records whose variant value is ``None`` or the empty string are
    excluded from the migration.

    Args:
        source: Dataset to read from; its ``fields()``, ``query()`` and
            ``full_name`` members are used.
        target: Dataset handed to ``query.migrate()`` as the destination.
        filters: Optional filter applied to the source query. When ``None``
            (the default) no additional filter is applied.

    Returns:
        Whatever ``query.migrate()`` returns; it is called with
        ``commit_mode='upsert'`` and ``follow=False``.

    Raises:
        IndexError: expected when ``source`` has no field with entity type
            ``variant`` (assuming ``objects()`` returns a list).
    """
    # Get the field containing variants;
    # requires "entity_type" to be set on the field.
    variant_field = source.fields(entity_type='variant').objects()[0]

    target_fields = [
        {
            'name': '_id',
            'expression': 'record["%s"].upper()' % variant_field.name,
            'ordering': 1,
        },
        {
            'name': 'variant',
            'entity_type': 'variant',
            'expression': 'record["_id"]',
            'ordering': 2,
        },
        {
            'name': 'sources',
            'data_type': 'object',
            'expression': '{"%s": True}' % source.full_name,
            'ordering': 3,
        },
    ]

    # Exclude records without variants. A value cannot be both None and ''
    # at once, so the two tests must be OR-ed before negating:
    # NOT (is None OR is empty). AND-ing them would never exclude anything.
    non_empty = ~ (Filter(**{variant_field.name: None}) |
                   Filter(**{variant_field.name: ''}))

    query = source.query(fields=[variant_field.name, '_id'])
    # Only apply the caller's filters when some were actually given.
    if filters is not None:
        query = query.filter(filters)
    query = query.filter(non_empty)

    # Preview the annotation:
    # print(next(query.annotate(target_fields)))
    return query.migrate(target, target_fields=target_fields,
                         commit_mode='upsert', follow=False)


# Destination dataset that receives the merged variants.
target = Dataset.get_or_create_by_full_path('<NEW TARGET DATASET>')

# (source dataset path, filter criteria) pairs, processed in order:
#   1. all Pathogenic variants from ClinVar
#   2. all BRAF variants from ICGC (TCGA)
merge_jobs = (
    ('solvebio:public:/ClinVar/3.7.4-2017-01-30/Variants-GRCh37',
     {'clinical_significance': 'Pathogenic'}),
    ('solvebio:public:/ICGC/3.0.0-23/SimpleSomaticMutation-GRCh37',
     {'gene_symbol': 'BRAF'}),
)
for source_path, criteria in merge_jobs:
    source = Dataset.get_by_full_path(source_path)
    filters = Filter(**criteria)
    merge_variants(source, target, filters)

# After a few minutes, you should see a list of variants in the target dataset.

Track Changes to a Dataset

Variant curation is a common workflow. The ability to take free-text notes and link them to "canonical entities" such as variants makes it possible to easily create a knowledge base that can be shared and collaborated on.

In this example, we create the framework for a simple "comments" dataset that keeps track of the person that made a change, the date/time it was made, their comment, and the canonical variant ID (automatically harmonized):

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
import solvebio
from solvebio import Dataset
from solvebio import DatasetImport
solvebio.login()


# Schema of the "comments" dataset: one dict per field.
fields = [
    # Free-text note.
    dict(name='comment', data_type='text'),
    # Variant field whose value is computed by entity_ids() from the
    # record's variant.
    dict(
        name='variant',
        data_type='string',
        entity_type='variant',
        is_list=True,
        expression='entity_ids("variant", record.variant)'
    ),
    # Filled in by the user() expression.
    dict(name='author', data_type='object', expression='user()'),
    # Filled in by the today() expression.
    dict(name='date', data_type='string', expression='today()'),
]

dataset = Dataset.get_or_create_by_full_path('<NEW DATASET>', fields=fields)

# Add a comment
record = dict(
    comment='This variant has been manually curated as pathogenic according to our internal guidelines.',
    variant='NP_009228.2:p.Ser993Asn'
)

# Import the single record using the field definitions from the
# dataset's template.
template_fields = dataset.template()['fields']
imp = DatasetImport.create(
    dataset_id=dataset.id,
    data_records=[record],
    target_fields=template_fields
)
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
require(solvebio)


# Schema of the "comments" dataset: one list per field.

# Free-text note.
comment_field <- list(
    name='comment',
    data_type='text'
)
# Variant field whose value is computed by entity_ids() from the
# record's variant.
variant_field <- list(
    name='variant',
    data_type='string',
    entity_type='variant',
    is_list=TRUE,
    expression='entity_ids("variant", record.variant)'
)
# Filled in by the user() expression.
author_field <- list(
    name='author',
    data_type='object',
    expression='user()'
)
# Filled in by the today() expression.
date_field <- list(
    name='date',
    data_type='string',
    expression='today()'
)
fields <- list(comment_field, variant_field, author_field, date_field)

dataset <- Dataset.get_or_create_by_full_path('<NEW DATASET>', fields=fields)

# Add a comment
record <- list(
    comment='This variant has been manually curated as pathogenic according to our internal guidelines.',
    variant='NP_009228.2:p.Ser993Asn'
)

# Import the single record using the field definitions from the
# dataset's template.
template_fields <- Dataset.template(dataset$id)$fields
DatasetImport.create(
    dataset_id=dataset$id,
    data_records=list(record),
    target_fields=template_fields
)