Skip to content

Annotation Examples

Annotate with Dataset Queries

SolveBio makes it possible to annotate a field with data from any other dataset. You can run any dataset query in an expression and use it to annotate a field.

In this example, we have a list of variants in the SolveBio variant ID format. We will annotate them with information about dataset presence (where has this variant been seen?), clinical significance, population allele frequencies, and prevalence in cancer types. You can easily add your own annotation datasets by importing them into SolveBio and querying them in an expression.

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
import solvebio
from solvebio import Annotator

solvebio.login()

# Only a handful of records is annotated on the fly here; the same
# field definitions can also be used to annotate entire datasets.
variant_ids = (
    'GRCH38-13-32330950-32330950-AAAG',
    'GRCH38-7-117559590-117559593-A',
    'GRCH38-17-43092412-43092412-T',
)
records = [{'variant': variant_id} for variant_id in variant_ids]

# Clinical significance of the variant, looked up in ClinVar.
# The clinical_significance field contains a list, so get(..., 0)
# keeps only its first value.
clinical_significance_field = {
    'name': 'clinical_significance',
    'data_type': 'string',
    'is_list': False,
    'expression': """
        get(dataset_field_values(
            'solvebio:public:/ClinVar/5.0.0-20190805/Variants-GRCH38',
            field='clinical_significance',
            entities=[('variant', record.variant)]
        ), 0)
        """
}

# 1000 Genomes allele frequency. The dataset queried is the GRCh37
# build, so the variant ID is crossmapped to GRCh37 before the lookup.
allele_frequency_field = {
    'name': '1000g_af',
    'data_type': 'double',
    'is_list': False,
    'expression': """
        dataset_field_values(
            'solvebio:public:/1000G/1.1.0-2015-01-08/1000G-GRCh37',
            field='allele_frequency',
            entities=[('variant', crossmap(record.variant, "GRCh37"))]
        )
        """
}

# Public datasets in which the variant has been seen.
beacons_field = {
    'name': 'beacons',
    'data_type': 'string',
    'is_list': True,
    'expression': """
        beacon(
            record.variant,
            'variant',
            visibility='public'
        )['found']
        """
}

fields = [
    clinical_significance_field,
    allele_frequency_field,
    beacons_field,
]

# annotate() returns an iterator over the annotated records.
annotator = Annotator(fields)
for annotated in annotator.annotate(records):
    print(annotated)
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
require(solvebio)

# In this example we will annotate a small list
# of records in real time, but you can also
# annotate entire datasets.
records = list(
    list(variant='GRCH38-13-32330950-32330950-AAAG'),
    list(variant='GRCH38-7-117559590-117559593-A'),
    list(variant='GRCH38-17-43092412-43092412-T')
)

# One entry per output field; each expression runs a dataset query
# whose result becomes the value of that field.
fields = list(
    list(
        # What's the clinical significance of the variant?
        name='clinical_significance',
        data_type='string',
        is_list=FALSE,
        # The clinical_significance field contains a list. Here we take the first value.
        expression="
        get(dataset_field_values(
            'solvebio:public:/ClinVar/5.0.0-20190805/Variants-GRCH38',
            field='clinical_significance',
            entities=[('variant', record.variant)]
        ), 0)
        "
    ),
    list(
        # What's the 1000 Genomes allele frequency?
        name='1000g_af',
        data_type='double',
        is_list=FALSE,
        # The records hold GRCH38 variant IDs but this dataset is the
        # GRCh37 build, so the ID is crossmapped to GRCh37 first.
        expression="
        dataset_field_values(
            'solvebio:public:/1000G/1.1.0-2015-01-08/1000G-GRCh37',
            field='allele_frequency',
            entities=[('variant', crossmap(record.variant, 'GRCh37'))]
        )
        "
    ),
    list(
        # What public datasets is this variant in?
        name='beacons',
        data_type='string',
        is_list=TRUE,
        expression="
        beacon(
            record.variant,
            'variant',
            visibility='public'
        )['found']
        "
    )
)

# Returns a data frame
Annotator.annotate(records=records, fields=fields)

Harmonize Entity Identifiers

If you have a list of variants in HGVS or VCF format, you can use an expression to normalize them for easy comparison. In the following example, all variants are actually the same; they were just processed by different bioinformatics pipelines:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
from solvebio import Annotator

# The same variant written in several input notations
# (VCF-style line, HGVS g./c./p. strings and an rsID).
records = [
    {'variant': 'chr17 41244429 . C T .'},
    {'variant': 'NC_000017.10:g.41244429C>T'},
    {'variant': 'NM_007299.3:c.788-1380G>A'},
    {'variant': 'NP_009225.1:p.Ser1040Asn'},
    {'variant': 'NP_009228.2:p.Ser993Asn'},
    # NOTE: rsID variants can represent multiple alleles
    {'variant': 'rs4986852'},
]

# A single output field holding the normalized SolveBio variant ID
# produced by entity_ids() for each input string.
fields = [
    {
        'name': 'variant_normalized',
        'data_type': 'string',
        'is_list': False,
        'expression': "entity_ids('variant', record.variant)"
    }
]

# Print each original notation next to its normalized ID.
for r in Annotator(fields).annotate(records):
    print(r['variant'], ' -> ', r['variant_normalized'])

# chr17 41244429 . C T .        ->  GRCH38-17-41244429-41244429-T
# NC_000017.10:g.41244429C>T    ->  GRCH38-17-43092412-43092412-T
# NM_007299.3:c.788-1380G>A     ->  GRCH38-17-43092412-43092412-T
# NP_009225.1:p.Ser1040Asn      ->  GRCH38-17-43092412-43092412-T
# NP_009228.2:p.Ser993Asn       ->  GRCH38-17-43092412-43092412-T
# rs4986852                     ->  GRCH38-17-43092412-43092412-T
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
require(solvebio)

# The same variant written in several input notations.
variant_notations <- c(
    'chr17 41244429 . C T .',
    'NC_000017.10:g.41244429C>T',
    'NM_007299.3:c.788-1380G>A',
    'NP_009225.1:p.Ser1040Asn',
    'NP_009228.2:p.Ser993Asn',
    # NOTE: rsID variants can represent multiple alleles
    'rs4986852'
)

# Wrap every notation in its own single-field record.
records <- lapply(variant_notations, function(notation) list(variant=notation))

# Output field that holds the normalized variant ID from entity_ids().
normalized_field <- list(
    name='variant_normalized',
    data_type='string',
    is_list=FALSE,
    expression="entity_ids('variant', record.variant)"
)
fields <- list(normalized_field)

annotated <- Annotator.annotate(records=records, fields=fields)
# chr17 41244429 . C T .        ->  GRCH38-17-41244429-41244429-T
# NC_000017.10:g.41244429C>T    ->  GRCH38-17-43092412-43092412-T
# NM_007299.3:c.788-1380G>A     ->  GRCH38-17-43092412-43092412-T
# NP_009225.1:p.Ser1040Asn      ->  GRCH38-17-43092412-43092412-T
# NP_009228.2:p.Ser993Asn       ->  GRCH38-17-43092412-43092412-T
# rs4986852                     ->  GRCH38-17-43092412-43092412-T

Variant Effect Prediction

SolveBio provides built-in support for running variant effect prediction using Veppy, an open source tool inspired by SnpEff and VEP. You can run it in a simple expression, which returns a list of predicted effects on known transcripts:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
from solvebio import Annotator

# Variants to run effect prediction on, in SolveBio variant ID format.
variant_ids = (
    'GRCH38-13-32330950-32330950-AAAG',
    'GRCH38-7-117559590-117559593-A',
    'GRCH38-17-43092412-43092412-T',
)
records = [{'variant': variant_id} for variant_id in variant_ids]

# predict_variant_effects() returns a list of predicted effects,
# hence an object field with is_list enabled.
effects_field = {
    'name': 'effects',
    'data_type': 'object',
    'is_list': True,
    'expression': "predict_variant_effects(record.variant)"
}
fields = [effects_field]

# Show, per variant, the (SO term, transcript) pair of every effect.
for annotated in Annotator(fields).annotate(records):
    variant = annotated['variant']
    term_and_transcript = [
        (effect['so_term'], effect['transcript'])
        for effect in annotated['effects']
    ]
    print(variant, '->', term_and_transcript)

# GRCH38-13-32330950-32330950-AAAG  -> [(u'frameshift_variant', u'NM_000059.3'), ... ]
# GRCH38-7-117559590-117559593-A    -> [(u'missense_variant', u'NM_007297.3'), ... ]
# GRCH38-17-43092412-43092412-T     -> [(u'missense_variant', u'NM_005228.3'), ... ]
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
require(solvebio)

# Variants to run effect prediction on, in SolveBio variant ID format.
variant_ids <- c(
    'GRCH38-13-32330950-32330950-AAAG',
    'GRCH38-7-117559590-117559593-A',
    'GRCH38-17-43092412-43092412-T'
)

# Wrap every variant ID in its own single-field record.
records <- lapply(variant_ids, function(variant_id) list(variant=variant_id))

# predict_variant_effects() returns a list of predicted effects,
# hence an object field with is_list enabled.
effects_field <- list(
    name='effects',
    data_type='object',
    is_list=TRUE,
    expression="predict_variant_effects(record.variant)"
)
fields <- list(effects_field)

annotated <- Annotator.annotate(records=records, fields=fields)
# GRCH38-13-32330950-32330950-AAAG  -> [(u'frameshift_variant', u'NM_000059.3'), ... ]
# GRCH38-7-117559590-117559593-A    -> [(u'missense_variant', u'NM_007297.3'), ... ]
# GRCH38-17-43092412-43092412-T     -> [(u'missense_variant', u'NM_005228.3'), ... ]

Merge a List of Entities

Sometimes you'll want to combine a list of entities (variants, genes, patients, samples) from a few different datasets. Using dataset migrations, it's possible to combine these into a unique list, ready for further annotation. This particular workflow can only be performed as a dataset migration, in which records are copied from one or more source datasets into a new target dataset.

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
import solvebio
from solvebio import Dataset
from solvebio import Filter
solvebio.login()


def merge_variants(source, target, filters=None):
    """Migrate the variants of ``source`` into ``target``.

    Records of ``source`` that match ``filters`` and carry a variant are
    copied into ``target`` with ``commit_mode='upsert'``. Each migrated
    record gets three fields: ``_id`` (the upper-cased variant crossmapped
    to GRCH38), ``variant`` (a copy of ``_id`` tagged as a variant entity)
    and ``sources`` (an object keyed by the source dataset's filename).

    Args:
        source: Dataset to read from. At least one of its fields must have
            ``entity_type`` set to ``'variant'``.
        target: Dataset the records are migrated into.
        filters: Optional filter restricting which source records are
            migrated. ``None`` (the default) applies no extra filter.

    Returns:
        Whatever ``query.migrate(...)`` returns; ``follow=False`` is
        passed, so the caller is expected to track progress itself.

    Raises:
        StopIteration: If ``source`` has no field with entity type
            ``'variant'``.
    """
    # The variant column is located through its "entity_type", so that
    # attribute must be set on the source dataset's fields.
    variant_field = next(source.fields(entity_type='variant'))

    target_fields = [
        {
            # Using the GRCH38 variant as the record ID presumably lets the
            # same variant from different sources upsert into one record
            # -- NOTE(review): confirm against the upsert semantics.
            'name': '_id',
            'expression': 'crossmap(record["%s"].upper(), "GRCH38")' % variant_field.name
        },
        {
            'name': 'variant',
            'entity_type': 'variant',
            'expression': 'record["_id"]',
            'depends_on': ['_id']
        },
        {
            'name': 'sources',
            'data_type': 'object',
            'expression': '{"%s": True}' % source.vault_object_filename
        },
    ]

    # Exclude records whose variant is missing OR empty. The two conditions
    # must be OR-ed before negating: a value can never be both None and '',
    # so negating their conjunction would exclude nothing.
    non_empty = ~(Filter(**{variant_field.name: None}) |
                  Filter(**{variant_field.name: ''}))

    query = source.query(fields=[variant_field.name, '_id'])
    # Only apply the caller's filter when one was actually supplied.
    if filters is not None:
        query = query.filter(filters)
    query = query.filter(non_empty)

    # Preview the annotation:
    # print(query.annotate(target_fields).next())
    return query.migrate(target, target_fields=target_fields,
                         commit_mode='upsert', follow=False)


target = Dataset.get_or_create_by_full_path('~/My Merged Dataset')

# First source: every ClinVar variant classified as pathogenic.
clinvar = Dataset.get_by_full_path(
    'solvebio:public:/ClinVar/5.0.0-20190805/Variants-GRCH38')
merge_variants(clinvar, target, Filter(clinical_significance="pathogenic"))

# Second source: every BRAF variant from ICGC (TCGA).
icgc = Dataset.get_by_full_path(
    'solvebio:public:/ICGC/3.0.0-23/SimpleSomaticMutation-GRCh37')
merge_variants(icgc, target, Filter(gene_symbol='BRAF'))

# The migrations were started without following them, so follow the
# target's activity here; after a few minutes the variants show up.
target.activity(follow=True)

# Keep only the variants that were contributed by both sources.
in_icgc = Filter(**{'sources.SimpleSomaticMutation-GRCh37__exact': True})
in_clinvar = Filter(**{'sources.Variants-GRCH38': True})
query = target.query(filters=in_icgc & in_clinvar)

print("BRAF variants in ICGC that are also classified as pathogenic in ClinVar")
for row in query:
    print(row)

Track Changes to a Dataset

Variant curation is a common workflow. The ability to take free-text notes and link them to "canonical entities" such as variants makes it possible to easily create a knowledge base that can be shared and collaborated on.

In this example, we create the framework for a simple "comments" dataset that keeps track of the person that made a change, the date/time it was made, their comment, and the canonical variant ID (automatically harmonized):

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
import solvebio
from solvebio import Dataset
from solvebio import DatasetImport
solvebio.login()


# Schema of the "comments" dataset, one definition per field.

# Free-text note written by the curator.
comment_field = {
    'name': 'comment',
    'data_type': 'text'
}

# Variant the note refers to; entity_ids() harmonizes whatever notation
# was supplied into canonical variant IDs.
variant_field = {
    'name': 'variant',
    'data_type': 'string',
    'entity_type': 'variant',
    'is_list': True,
    'expression': 'entity_ids("variant", record.variant)'
}

# Who made the change, filled in by the user() expression.
author_field = {
    'name': 'author',
    'data_type': 'object',
    'expression': 'user()',
}

# When the change was made, filled in by the today() expression.
date_field = {
    'name': 'date',
    'data_type': 'string',
    'expression': 'today()'
}

fields = [comment_field, variant_field, author_field, date_field]

dataset = Dataset.get_or_create_by_full_path('<NEW DATASET>', fields=fields)

# A single comment to add; only the free-text note and the variant are
# supplied, the remaining fields come from their expressions.
record = dict(
    comment='This variant has been manually curated as pathogenic according to our internal guidelines.',
    variant='NP_009228.2:p.Ser993Asn',
)

# Import the record using the dataset's own field template as the
# target fields.
imp = DatasetImport.create(
    dataset_id=dataset.id,
    data_records=[record],
    target_fields=dataset.template()['fields']
)
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
require(solvebio)


# Schema of the "comments" dataset, one definition per field.

# Free-text note written by the curator.
comment_field <- list(
    name='comment',
    data_type='text'
)

# Variant the note refers to; entity_ids() harmonizes whatever notation
# was supplied into canonical variant IDs.
variant_field <- list(
    name='variant',
    data_type='string',
    entity_type='variant',
    is_list=TRUE,
    expression='entity_ids("variant", record.variant)'
)

# Who made the change, filled in by the user() expression.
author_field <- list(
    name='author',
    data_type='object',
    expression='user()'
)

# When the change was made, filled in by the today() expression.
date_field <- list(
    name='date',
    data_type='string',
    expression='today()'
)

fields <- list(comment_field, variant_field, author_field, date_field)

dataset <- Dataset.get_or_create_by_full_path('<NEW DATASET>', fields=fields)

# A single comment to add; only the free-text note and the variant are
# supplied, the remaining fields come from their expressions.
record <- list(
    comment='This variant has been manually curated as pathogenic according to our internal guidelines.',
    variant='NP_009228.2:p.Ser993Asn'
)

# Import the record using the dataset's own field template as the
# target fields.
DatasetImport.create(
    dataset_id=dataset$id,
    data_records=list(record),
    target_fields=Dataset.template(dataset$id)$fields
)

Exploding and Melting Records

Sometimes you come across fields that have multiple values instead of a single value. This case can be handled with the explode and melt expression functions.

In order to use them, two additional parameters, pre_annotation_expression and post_annotation_expression, have been added to the Annotate settings, which are used by imports, exports, migrations and queries.

  • pre_annotation_expression takes an expression that operates on each record at once BEFORE annotating each field.

  • post_annotation_expression takes an expression that runs AFTER annotating each field, before yielding the record to the caller.

If either expression returns a list of objects, those objects become new records in the output. The outputs MUST be one dict or a list of dicts, otherwise the annotator will fail on the record.

The expression can also act as a filter, by returning an empty list or None based on the expression. For example, you can prevent certain records from being indexed by setting pre_annotation_expression to record if record.value > 2 else None which will only index records where the value field is greater than 2.

The explode function

The explode function splits N values from M list fields into N records, e.g.

explode({'a': [1, 2, 3]}, fields=['a']) -> [{'a': 1}, {'a': 2}, {'a': 3}]

Example: explode a list of values generated by an expression

In some cases, expressions may result in records with list values. For example, the predict_variant_effects() function may return a list of effects, one per transcript.

In order to ensure that each effect becomes its own record in the output, we can call explode() from the post_annotation_expression (since we want it to run after predicting the effects):

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
# Imported here so the snippet runs on its own, like the other examples.
from solvebio import Annotator

records = [{'_id': '0', 'variant': 'GRCh37-1-236590816-236590816-T'},
           {'_id': '1', 'variant': 'GRCh37-1-236154365-236154365-G'}]

# predict_variant_effects() may return several effects per variant,
# so the field is declared as a list of objects.
fields = [{
    "name": "effects",
    "expression": "predict_variant_effects(record.variant)",
    "data_type": "object",
    "is_list": True
}]

# explode() has to run AFTER the effects have been predicted, hence it
# is passed as the post-annotation expression: every entry of the
# "effects" list then becomes a record of its own.
explode_effects = 'explode(record, fields=["effects"])'
annotated_records = Annotator(fields=fields).annotate(
    records, post_annotation_expression=explode_effects)
for r in annotated_records:
    print(r)

# Output:
# {'_id': '0_0', 'variant': 'GRCh37-1-236590816-236590816-T', 'effects': {'impact': 'NONE', 'lof': None, 'so_term': 'no_sequence_alteration', 'intron': None, 'so_accession': '', 'exon': None, 'gene': 'EDARADD', 'transcript': 'NM_145861.2'}}
# {'_id': '0_1', 'variant': 'GRCh37-1-236590816-236590816-T', 'effects': {'impact': 'NONE', 'lof': None, 'so_term': 'no_sequence_alteration', 'intron': None, 'so_accession': '', 'exon': None, 'gene': 'EDARADD', 'transcript': 'NM_080738.3'}}
# {'_id': '1_0', 'variant': 'GRCh37-1-236154365-236154365-G', 'effects': {'impact': 'NONE', 'lof': None, 'so_term': 'no_sequence_alteration', 'intron': None, 'so_accession': '', 'exon': None, 'gene': 'NID1', 'transcript': 'NM_002508.2'}}

If _id is in the original record, each new record will have an integer appended to the _id with the index of each exploded record. The field schema does not change with explode. Instead, the list field becomes a single value field.

Explode on multiple fields also works, but each field is expanded simultaneously, not "combinatorically", e.g.

explode({'a': [1, 2, 3], 'b': [4, 5]}, fields=['a', 'b']) -> [{'a': 1, 'b': 4}, {'a': 2, 'b': 5}, {'a': 3, 'b': None}]

The melt function

The melt function converts one or more fields into 1 NEW key field and 1 NEW value field across 1+ new records. All fields must have the same data type, e.g.

melt([{'a': 1}, {'a': 2}], fields=['a']) -> [{'key': 'a', 'value': 1}, {'key': 'a', 'value': 2}]