Skip to content

Commit d726fb2

Browse files
committed
Validate chromosomes according to a strict whitelist
This is intended to fix an issue where non-categorical chrom names caused Tabix to do Bad Things. (more permissive validators kept getting bypassed, and it was affecting quality of service for users)
1 parent 4c6f1ac commit d726fb2

File tree

10 files changed

+56
-23
lines changed

10 files changed

+56
-23
lines changed

assets/js/pages/gwas_upload.js

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -164,6 +164,8 @@ modal.$on('has_options', function (parser_options) { // Close with options selec
164164
let is_valid;
165165
try {
166166
const sample_data = rows.slice(first_data_index).map(row => parser(row));
167+
// Note: in future, may want to reflect the "allowed chromosomes whitelist" in frontend validator
168+
// (for now, there's a slight gap between what the web UI and the server check when giving feedback)
167169
is_valid = validateSortOrder(sample_data);
168170
} catch (e) {
169171
// TODO: Improve UI representation of other parsing errors, like AF out of range

config/settings/base.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -317,8 +317,10 @@
317317
'TEST_REQUEST_DEFAULT_FORMAT': 'vnd.api+json'
318318
}
319319

320-
# The maximum region size (bp) for a single Locuszoom plot. This is used to prevent ginormous API calls.
321-
LZ_MAX_REGION_SIZE = 1_000_000
320+
# The maximum region size (bp) for a single Locuszoom plot. This is used to prevent ginormous API calls. Value is set
321+
# for JS in LocalZoom, separately.
322+
LZ_MAX_REGION_SIZE = 2_000_000
323+
322324
# The "official" domain name. This is set in a .env file, and it must exactly match the base url registered as part of
323325
# your OAuth provider configuration (eg callback urls). It should be a domain, not an IP.
324326
LZ_OFFICIAL_DOMAIN = env('LZ_OFFICIAL_DOMAIN', default='my.locuszoom.org')

locuszoom_plotting_service/api/views.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,7 @@ def _query_params(self) -> ty.Tuple[str, int, int]:
132132
raise drf_exceptions.ParseError('"end" position must be greater than "start"')
133133

134134
if not (0 <= (end - start) <= settings.LZ_MAX_REGION_SIZE):
135-
raise drf_exceptions.ParseError(f'Cannot handle requested region size. Max allowed is {500_000}')
135+
raise drf_exceptions.ParseError(
136+
f'Cannot handle requested region size. Max allowed is {settings.LZ_MAX_REGION_SIZE}')
136137

137138
return chrom, start, end

locuszoom_plotting_service/templates/pages/about.html

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,7 @@ <h3 class="faq" id="prepare-data">How should I prepare my data for uploading?</h
7878
<li>By marker (chrom_pos:ref/alt): <i>9:22125503_G/C</i>, OR</li>
7979
<li>By individual columns (preferred)
8080
<ul>
81-
<li>Chromosome</li>
81+
<li>Chromosome (1-25, X, Y, M, or MT; please contact us if you need other chromosome names)</li>
8282
<li>Position</li>
8383
<li>Ref. allele (according to human reference genome) (optional for plots, but required for LD)</li>
8484
<li>Alt. allele (according to human reference genome) (optional for plots, but required for LD)</li>

package-lock.json

Lines changed: 10 additions & 10 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

package.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@
3131
"d3": "^5.16.0",
3232
"d3-tip": "0.9.1",
3333
"gwas-credible-sets": "^0.1.0",
34-
"localzoom": "git+https://github.com/statgen/localzoom.git#b78103b",
34+
"localzoom": "git+https://github.com/statgen/localzoom.git#84215bf",
3535
"locuszoom": "0.14.0",
3636
"lodash": "^4.17.21",
3737
"pako": "^1.0.11",

requirements/base.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,5 +25,5 @@ pyyaml # Required for OpenAPI rendering
2525
boltons~=20.2.1
2626
scipy~=1.5.3
2727
python-magic==0.4.18
28-
zorp[perf,lookups]==0.3.4
28+
zorp[perf,lookups]==0.3.6
2929
genelocator==1.1.2

tests/ingest/test_validators.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -55,15 +55,19 @@ def test_positions_not_sorted(self):
5555
with pytest.raises(val_exc.ValidationException):
5656
validators.standard_gwas_validator._validate_contents(reader)
5757

58-
def test_rejects_rsids(self):
58+
def test_whitelists_chroms(self):
5959
reader = sniffers.guess_gwas_generic([
6060
"#chrom\tpos\tref\talt\tneg_log_pvalue",
6161
"X\t1\tA\tC\t7.3",
62-
"rs1100\t2\tA\tC\t7.3",
63-
"rs2521\t1\tA\tC\t7.3",
62+
"chr1\t2\tA\tC\t7.3",
63+
"invalid\t1\tA\tC\t7.3",
6464
])
6565

66-
with pytest.raises(val_exc.ValidationException, match='is an rsID'):
66+
with pytest.raises(
67+
val_exc.ValidationException,
68+
match="Chromosome INVALID is not a valid chromosome name. Must be "
69+
"one of: '1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 M MT X Y'"
70+
):
6771
validators.standard_gwas_validator._validate_contents(reader)
6872

6973
def test_validates_for_file(self):

util/ingest/helpers.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@
44

55
import functools
66
import logging
7+
import re
8+
import typing as ty
79

810
from . import exceptions
911
from zorp import exceptions as z_exc
@@ -29,3 +31,10 @@ def wrapper(*args, **kwargs):
2931
logger.exception('Task failed due to unexpected error')
3032
raise exceptions.UnexpectedIngestException
3133
return wrapper
34+
35+
36+
def natural_sort(items: ty.Iterable):
37+
"""Natural sort a list of strings. Used for human-friendly error messages, eg, from a `set` of allowed strings"""
38+
convert = lambda text: int(text) if text.isdigit() else text.lower() # noqa: E731
39+
alphanum_key = lambda key: [convert(c) for c in re.split('([0-9]+)', key)] # noqa: E731
40+
return sorted(items, key=alphanum_key)

util/ingest/validators.py

Lines changed: 18 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,15 @@
1414

1515
logger = logging.getLogger(__name__)
1616

17+
# Whitelist of allowed chromosomes. It's ok to add more values, as long as we have some kind of whitelist.
18+
# The generic parser uses these as a safeguard, because when people slip a non-categorical value into the chrom field,
19+
# tabix uses all the RAM on the system and then crashes horribly. All our looser heuristics
20+
ALLOWED_CHROMS = frozenset({
21+
'1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20',
22+
'21', '22', '23', '24', '25',
23+
'X', 'Y', 'M', 'MT'
24+
})
25+
1726

1827
class _GwasValidator:
1928
"""Validate a raw GWAS file as initially uploaded (given filename and instructions on how to parse it)"""
@@ -52,12 +61,18 @@ def _validate_data_rows(self, reader) -> bool:
5261
for cp, tied_variants in cp_groups:
5362
cur_chrom = cp[0]
5463

55-
if cur_chrom.startswith('RS'): # Parser always capitalizes chrom names
56-
raise v_exc.ValidationException(f'Invalid chromosome specified: value "{cur_chrom}" is an rsID')
64+
# Prevent server issues by imposting strict limits on what chroms are allowed
65+
if cur_chrom not in ALLOWED_CHROMS:
66+
options = ' '.join(helpers.natural_sort(ALLOWED_CHROMS))
67+
raise v_exc.ValidationException(
68+
f"Chromosome {cur_chrom} is not a valid chromosome name. Must be one of: '{options}'")
5769

5870
if cur_chrom == prev_chrom and cp[1] < prev_pos:
5971
# Positions not in correct order for Pheweb to use
60-
raise v_exc.ValidationException(f'Positions must be sorted prior to uploading. Position chr{cur_chrom}:{cp[1]} should not follow chr{prev_chrom}:{prev_pos}')
72+
raise v_exc.ValidationException(
73+
f'Positions must be sorted prior to uploading. '
74+
f'Position chr{cur_chrom}:{cp[1]} should not follow chr{prev_chrom}:{prev_pos}'
75+
)
6176

6277
if cur_chrom != prev_chrom:
6378
if cur_chrom in chrom_seen:

0 commit comments

Comments
 (0)