Skip to content

Commit

Permalink
Some tweaks and fixes for the UniProt Qleverfile
Browse files Browse the repository at this point in the history
  • Loading branch information
Hannah Bast committed Dec 4, 2024
1 parent 0dc775f commit 0b78e31
Showing 1 changed file with 9 additions and 9 deletions.
18 changes: 9 additions & 9 deletions src/qlever/Qleverfiles/Qleverfile.uniprot
Original file line number Diff line number Diff line change
Expand Up @@ -4,25 +4,25 @@
# qlever index # takes ~ 40 hours and ~ 60 GB RAM (on an AMD Ryzen 9 9950X)
# qlever start # starts the server (takes a few seconds)
#
# Install packages: sudo apt install -y libxml2-utils parallel xz-utils pv
# Install packages: sudo apt install -y libxml2-utils parallel xz-utils wget
# Install manually: Apache Jena binaries (https://dlcdn.apache.org/jena/binaries)
#
# Set DATE to the date of the latest release. Build on SSD (requires ~ 7 TB
# during build, ~ 3 TB after build). When running the server, the files
# `uniprot.index.???.meta` can be on HDD without significant performance loss.
# during build, ~ 3 TB after build).

[data]
NAME = uniprot
DATE = 2024-11-27
RDFXML_DIR = rdf.${DATE}
TTL_DIR = ttl.${DATE}
UNIPROT_URL = https://ftp.uniprot.org/pub/databases/uniprot/current_release/rdf
RHEA_URLS = https://ftp.expasy.org/databases/rhea/rdf/chebi.owl.gz https://ftp.expasy.org/databases/rhea/rdf/rhea.rdf.gz
GET_EXAMPLES_CMD = mkdir -p ${TTL_DIR} && git clone git@github.com:sib-swiss/sparql-examples.git && (cd sparql-examples && ./convertToOneTurtle.sh -p uniprot && mv -f examples_uniprot.ttl ../${TTL_DIR} && cd .. && rm -rf sparql-examples)
GET_RDFXML_CMD = mkdir -p ${RDFXML_DIR} && (echo "${RHEA_URLS}" | tr " " "\n"; curl -s ${UNIPROT_URL}/RELEASE.meta4 | sed "s/<metalink.*/<metalink>/" | xmllint --xpath "/metalink/files/file/url[@location=\"ch\"]/text()" -) | while read URL; do wget --no-verbose -P ${RDFXML_DIR} $$URL 2>&1 | tee -a uniprot.download-log; done
RHEA_URL = https://ftp.expasy.org/databases/rhea/rdf
EXAMPLES_URL = https://github.com/sib-swiss/sparql-examples
GET_EXAMPLES_CMD = mkdir -p ${TTL_DIR} && git clone ${EXAMPLES_URL} && (cd sparql-examples && ./convertToOneTurtle.sh -p uniprot && gzip examples_uniprot.ttl && mv -f examples_uniprot.ttl.gz ../${TTL_DIR} && cd .. && rm -rf sparql-examples)
GET_RDFXML_CMD = mkdir -p ${RDFXML_DIR} && (echo "${RHEA_URL}/chebi.owl.gz"; echo "${RHEA_URL}/rhea.rdf.gz"; curl -s ${UNIPROT_URL}/RELEASE.meta4 | sed "s/<metalink.*/<metalink>/" | xmllint --xpath "/metalink/files/file/url[@location=\"ch\"]/text()" -) | while read URL; do wget --no-verbose -P ${RDFXML_DIR} $$URL 2>&1 | tee -a uniprot.download-log; done
RDFXML2TTL_CMD = mkdir -p ${TTL_DIR} && for RDFXML in ${RDFXML_DIR}/*.{owl,owl.xz,rdf,rdf.xz}; do echo "xzcat -f $$RDFXML | rdfxml --output=ttl -q 2> ${TTL_DIR}/$$(basename $$RDFXML).stderr | gzip -c > ${TTL_DIR}/$$(basename $$RDFXML | sed 's/\(rdf\|rdf.xz\|owl\|owl.xz\)$$/ttl.gz/') && echo 'DONE converting $$RDFXML'"; done | parallel
GET_DATA_CMD = date > ${NAME}.get-data.begin-date && ${GET_EXAMPLES_CMD} && ${GET_RDFXML_CMD} && ${RDFXML2TTL_CMD} && date > ${NAME}.get-data.end-date
DESCRIPTION = Complete UniProt data from ${UNIPROT_URL}, with additional data from https://ftp.expasy.org/databases/rhea/rdf
GET_DATA_CMD = date > ${NAME}.get-data.begin-date && ${GET_EXAMPLES_CMD} && ${GET_RDFXML_CMD} && ${RDFXML2TTL_CMD} && date > ${NAME}.get-data.end-date
DESCRIPTION = Complete UniProt data from ${UNIPROT_URL}, with additional data from ${RHEA_URL} and ${EXAMPLES_URL}

[index]
INPUT_FILES = ${data:TTL_DIR}/*.ttl.gz
Expand Down Expand Up @@ -54,7 +54,7 @@ MULTI_INPUT_JSON = [{ "cmd": "zcat {}", "graph": "http://sparql.uniprot.org/unip
{ "cmd": "zcat ${data:TTL_DIR}/rhea.ttl.gz", "graph": "https://sparql.rhea-db.org/rhea" },
{ "cmd": "zcat ${data:TTL_DIR}/examples_uniprot.ttl.gz", "graph": "http://sparql.uniprot.org/.well-known/sparql-examples" },
{ "cmd": "zcat ${data:TTL_DIR}/core.ttl.gz", "graph": "http://purl.uniprot.org/core" },
{ "cmd": "zcat void.ttl.gz", "graph": "http://rdfs.org/ns/void" }]
{ "cmd": "zcat ${data:TTL_DIR}/void.ttl.gz", "graph": "http://rdfs.org/ns/void" }]
SETTINGS_JSON = { "languages-internal": [], "prefixes-external": [""], "locale": { "language": "en", "country": "US", "ignore-punctuation": true }, "ascii-prefixes-only": true, "num-triples-per-batch": 25000000 }
STXXL_MEMORY = 60G

Expand Down

0 comments on commit 0b78e31

Please sign in to comment.