Skip to content

Commit fee1f36

Browse files
Merge pull request #467 from micheldumontier/master
update to a few scripts
2 parents bdb885f + f232ac1 commit fee1f36

File tree

3 files changed

+135
-153
lines changed

3 files changed

+135
-153
lines changed

goa/goa.php

+67-73
Original file line numberDiff line numberDiff line change
@@ -238,17 +238,12 @@ function process($file){
238238

239239

240240
function getAspect($anAspect){
241-
if(count($anAspect)){
242-
if($anAspect == "F"){
243-
return "function";
244-
}elseif($anAspect == "P"){
245-
return "process";
246-
}elseif($anAspect == "C"){
247-
return "component";
248-
}
249-
250-
}else{
251-
return null;
241+
if($anAspect == "F"){
242+
return "function";
243+
}elseif($anAspect == "P"){
244+
return "process";
245+
}elseif($anAspect == "C"){
246+
return "component";
252247
}
253248
}
254249

@@ -283,73 +278,72 @@ function getdbURI($db_id, $db_object_id){
283278
* ["experimental evidence code" => ["Inferred from Experiment", "0000006"]]
284279
* See: http://www.geneontology.org/GO.evidence.shtml
285280
**/
286-
function getEvidenceCodeLabelArr($aec){
287-
if(count($aec)){
288-
//experimental code
289-
$ec = array(
290-
"EXP"=> array("Inferred from Experiment","0000006"),
291-
"IDA"=> array("Inferred from Direct Assay","0000314"),
292-
"IPI"=> array("Inferred from Physical Interaction","0000021"),
293-
"IMP"=> array("Inferred from Mutant Phenotype", "0000315"),
294-
"IGI"=> array("Inferred from Genetic Interaction","0000316"),
295-
"IEP"=> array("Inferred from Expression Pattern", "0000008")
296-
);
297-
298-
$htp = array(
299-
"HTP" => array("Inferred from High Throughput Experiment",""),
300-
"HDA" => array("Inferred from High Throughput Direct Assay",""),
301-
"HMP" => array("Inferred from Hight Throughput Mutant Phenotype",""),
302-
"HGI" => array("Inferred from High Throughput Genetic Interaction",""),
303-
"HEP" => array("Inferred from High Throughput Expression Pattern","")
281+
function getEvidenceCodeLabelArr($aec)
282+
{
283+
//experimental code
284+
$ec = array(
285+
"EXP"=> array("Inferred from Experiment","0000006"),
286+
"IDA"=> array("Inferred from Direct Assay","0000314"),
287+
"IPI"=> array("Inferred from Physical Interaction","0000021"),
288+
"IMP"=> array("Inferred from Mutant Phenotype", "0000315"),
289+
"IGI"=> array("Inferred from Genetic Interaction","0000316"),
290+
"IEP"=> array("Inferred from Expression Pattern", "0000008")
304291
);
305292

306-
//computational analysis codes
307-
$cac = array(
308-
"ISS"=> array("Inferred from Sequence or Structural Similarity","0000027"),
309-
"ISO"=> array("Inferred from Sequence Orthology", "0000201"),
310-
"ISA"=> array("Inferred from Sequence Alignment", "0000200"),
311-
"ISM"=> array("Inferred from Sequence Model", "0000202"),
312-
"IGC"=> array("Inferred from Genomic Context", "0000317"),
313-
"IBA"=> array("Inferred from Biological aspect of Ancestor","0000318"),
314-
"IBD"=> array("Inferred from Biological aspect of Desendant", "0000319"),
315-
"IKR"=> array("Inferred from Key Residues","0000320"),
316-
"IRD"=> array("Inferred from Rapid Divergence","0000321"),
317-
"RCA"=> array("Inferred from Reviewed Computational Analysis","0000245")
318-
);
319-
320-
//author statement codes
321-
$asc = array(
322-
"TAS"=> array("Traceable Author Statement","0000304"),
323-
"NAS"=> array("Non-Traceable Author Statement","0000303")
324-
);
325-
//curator statement codes
326-
$csc = array(
327-
"IC"=> array("Inferred by Curator","0000001"),
328-
"ND"=> array("No biological Data available","0000035")
329-
);
330-
//automatically assigned codes
331-
$aac = array(
332-
"IEA"=>array("Inferred from Electronic Annotation", "0000203")
293+
$htp = array(
294+
"HTP" => array("Inferred from High Throughput Experiment",""),
295+
"HDA" => array("Inferred from High Throughput Direct Assay",""),
296+
"HMP" => array("Inferred from Hight Throughput Mutant Phenotype",""),
297+
"HGI" => array("Inferred from High Throughput Genetic Interaction",""),
298+
"HEP" => array("Inferred from High Throughput Expression Pattern","")
299+
);
300+
301+
//computational analysis codes
302+
$cac = array(
303+
"ISS"=> array("Inferred from Sequence or Structural Similarity","0000027"),
304+
"ISO"=> array("Inferred from Sequence Orthology", "0000201"),
305+
"ISA"=> array("Inferred from Sequence Alignment", "0000200"),
306+
"ISM"=> array("Inferred from Sequence Model", "0000202"),
307+
"IGC"=> array("Inferred from Genomic Context", "0000317"),
308+
"IBA"=> array("Inferred from Biological aspect of Ancestor","0000318"),
309+
"IBD"=> array("Inferred from Biological aspect of Desendant", "0000319"),
310+
"IKR"=> array("Inferred from Key Residues","0000320"),
311+
"IRD"=> array("Inferred from Rapid Divergence","0000321"),
312+
"RCA"=> array("Inferred from Reviewed Computational Analysis","0000245")
333313
);
334314

335-
if(array_key_exists($aec, $ec)){
336-
return array("experimental evidence code"=>$ec[$aec]);
337-
}elseif(array_key_exists($aec, $htp)){
338-
return array("high throughput code"=>$htp[$aec]);
339-
}elseif(array_key_exists($aec, $cac)){
340-
return array("computational analysis code"=>$cac[$aec]);
341-
}elseif(array_key_exists($aec, $asc)){
342-
return array("author statement code"=>$asc[$aec]);
343-
}elseif(array_key_exists($aec, $csc)){
344-
return array("curator statement code"=>$csc[$aec]);
345-
}elseif(array_key_exists($aec, $aac)){
346-
return array("automatically assigned code"=>$aac[$aec]);
347-
}else{
348-
return array("unmapped evidence code"=> $aec);
349-
}
350-
} else {
351-
return null;
315+
//author statement codes
316+
$asc = array(
317+
"TAS"=> array("Traceable Author Statement","0000304"),
318+
"NAS"=> array("Non-Traceable Author Statement","0000303")
319+
);
320+
//curator statement codes
321+
$csc = array(
322+
"IC"=> array("Inferred by Curator","0000001"),
323+
"ND"=> array("No biological Data available","0000035")
324+
);
325+
//automatically assigned codes
326+
$aac = array(
327+
"IEA"=>array("Inferred from Electronic Annotation", "0000203")
328+
);
329+
330+
if(array_key_exists($aec, $ec)){
331+
return array("experimental evidence code"=>$ec[$aec]);
332+
}elseif(array_key_exists($aec, $htp)){
333+
return array("high throughput code"=>$htp[$aec]);
334+
}elseif(array_key_exists($aec, $cac)){
335+
return array("computational analysis code"=>$cac[$aec]);
336+
}elseif(array_key_exists($aec, $asc)){
337+
return array("author statement code"=>$asc[$aec]);
338+
}elseif(array_key_exists($aec, $csc)){
339+
return array("curator statement code"=>$csc[$aec]);
340+
}elseif(array_key_exists($aec, $aac)){
341+
return array("automatically assigned code"=>$aac[$aec]);
342+
}else{
343+
return array("unmapped evidence code"=> $aec);
352344
}
345+
346+
return null;
353347
}
354348

355349
function parseDate($str){

hgnc/hgnc.php

+22-11
Original file line numberDiff line numberDiff line change
@@ -324,19 +324,30 @@ function process(){
324324
}
325325
}
326326
if(!empty($r[$h['gene_family_id']])){
327-
$s = $r[$h['gene_family_id']];
328-
parent::AddRDF(
329-
parent::triplifyString($id_res, $this->getVoc()."gene-family-tag", utf8_encode(htmlspecialchars($s))).
330-
parent::describeProperty($this->getVoc()."gene-family-tag", "Gene Family Tag","Tag used to designate a gene family or group the gene has been assigned to, according to either sequence similarity or information from publications, specialist advisors for that family or other databases. Families/groups may be either structural or functional, therefore a gene may belong to more than one family/group. These tags are used to generate gene family or grouping specific pages at genenames.org and do not necessarily reflect an official nomenclature. Each gene family has an associated gene family tag and gene family description. If a particular gene is a member of more than one gene family, the tags and the descriptions will be shown in the same order.")
327+
parent::addRDF(
328+
parent::describeProperty($this->getVoc()."gene-family", "gene family",null,null, "Used to designate a gene family or group the gene has been assigned to, according to either sequence similarity or information from publications, specialist advisors for that family or other databases. Families/groups may be either structural or functional, therefore a gene may belong to more than one family/group. These tags are used to generate gene family or grouping specific pages at genenames.org and do not necessarily reflect an official nomenclature. Each gene family has an associated gene family tag and gene family description. If a particular gene is a member of more than one gene family, the tags and the descriptions will be shown in the same order.").
329+
parent::describeClass($this->getVoc()."Gene-Family", "gene family", null,null,"Name given to a particular gene family. The gene family description has an associated gene family tag. Gene families are used to group genes according to either sequence similarity or information from publications, specialist advisors for that family or other databases. Families/groups may be either structural or functional, therefore a gene may belong to more than one family/group.")
331330
);
332-
}
333331

334-
if(!empty($r[$h['gene_family']])){
335-
$s = $r[$h['gene_family']];
336-
parent::AddRDF(
337-
parent::triplifyString($id_res, $this->getVoc()."gene-family-description", utf8_encode(htmlspecialchars($s))).
338-
parent::describeProperty($this->getVoc()."gene-family-description", "gene family name","Name given to a particular gene family. The gene family description has an associated gene family tag. Gene families are used to group genes according to either sequence similarity or information from publications, specialist advisors for that family or other databases. Families/groups may be either structural or functional, therefore a gene may belong to more than one family/group.")
339-
);
332+
$gf_ids = $r[$h['gene_family_id']];
333+
$gf_des = $r[$h['gene_family']];
334+
335+
$_gf_ids = explode("|", $gf_ids);
336+
$_gf_des = explode("|", $gf_des);
337+
foreach ($_gf_ids as $i => $gf_id) {
338+
$gf_res = $this->getRes().$uid."_gf_res_$gf_id";
339+
#print_r($_gf_des);
340+
if(isset($_gf_des[$i])) {
341+
$gf_description = utf8_encode(htmlspecialchars($_gf_des[$i]));
342+
} else $gf_description = "";
343+
parent::addRDF(
344+
parent::triplifyString($id, $this->getVoc()."gene-family", $gf_res).
345+
parent::describeIndividual($gf_res, $gf_description, parent::getVoc()."Gene-Family" ))
346+
;
347+
}
348+
349+
#echo parent::getRDF();exit;
350+
340351
}
341352
//write RDF to file
342353
$this->WriteRDFBufferToWriteFile();

omim/omim.php

+46-69
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ class OMIMParser extends Bio2RDFizer
3333
function __construct($argv) {
3434
parent::__construct($argv, 'omim');
3535
parent::addParameter('files',true,null,'all|omim#','entries to process: comma-separated list or hyphen-separated range');
36-
parent::addParameter('omim_api_url',false,null,'http://api.omim.org/api/entry?include=all&format=json');
36+
parent::addParameter('omim_api_url',false,null,'https://api.omim.org/api/entry?include=all&format=json');
3737
parent::addParameter('omim_api_key',false,null);
3838
parent::addParameter('omim_api_key_file',false,null,'omim.key','A file containing your omim KEY');
3939
parent::initialize();
@@ -55,22 +55,38 @@ function Run()
5555
}
5656
} else {
5757
trigger_error("No OMIM key has been provided either by commmand line or in the expected omim key file $key_file",E_USER_WARNING);
58+
exit;
5859
}
5960
}
6061

61-
// get the list of mim2gene entries
62-
$entries = $this->GetListOfEntries($ldir);
62+
// get the list of entries
63+
$file = "mimTitles.txt";
64+
$rfile = "https://data.omim.org/downloads/$key/$file";
65+
$lfile = $ldir.$file;
66+
if(!file_exists($lfile) && parent::getParameterValue('download') == false) {
67+
trigger_error($lfile." not found. Will attempt to download.", E_USER_NOTICE);
68+
parent::setParameterValue('download',true);
69+
}
70+
if(parent::getParameterValue('download') == true) {
71+
echo "downloading $file ... ";
72+
Utils::DownloadSingle($rfile, $lfile);
73+
}
74+
// parse the file
75+
$fp = fopen($lfile,"rb");
76+
while($l = fgetcsv($fp,0,"\t")) {
77+
if($l[0][0] == "#") continue;
78+
$full_list[ $l[1] ] = "";
79+
}
6380

81+
6482
// get the work specified
6583
$list = trim(parent::getParameterValue('files'));
6684
if($list != 'all') {
6785
// check if a hyphenated list was provided
6886
if(($pos = strpos($list,"-")) !== FALSE) {
6987
$start_range = substr($list,0,$pos);
7088
$end_range = substr($list,$pos+1);
71-
72-
// get the whole list
73-
$full_list = $this->GetListOfEntries($ldir);
89+
7490
// now intersect
7591
foreach($full_list AS $e => $type) {
7692
if($e >= $start_range && $e <= $end_range) {
@@ -84,9 +100,11 @@ function Run()
84100
foreach($b AS $e) {
85101
$myentries[$e] = '';
86102
}
87-
$entries = array_intersect_key ($entries,$myentries);
103+
$entries = array_intersect_key ($full_list,$myentries);
88104
}
89-
}
105+
} else $entries = $full_list;
106+
107+
echo "Will process a total of ".count($entries)." OMIM entries".PHP_EOL;
90108

91109
// set the write file
92110
$gz = (strstr(parent::getParameterValue('output_format'),".gz") === FALSE)?false:true;
@@ -102,17 +120,13 @@ function Run()
102120
$total = count($entries);
103121
foreach($entries AS $omim_id => $type) {
104122
echo "processing ".(++$i)." of $total - omim# ";
105-
$download_file = $ldir.$omim_id.".json.gz";
106-
$gzfile = "compress.zlib://$download_file";
123+
$lfile = $ldir.$omim_id.".json.gz";
124+
$gzfile = "compress.zlib://$lfile";
107125
// download if the file doesn't exist or we are told to
108-
if(!file_exists($download_file) || parent::getParameterValue('download') == true) {
126+
if(!file_exists($lfile) || parent::getParameterValue('download') == true) {
109127
// download using the api
110-
$url = parent::getParameterValue('omim_api_url').'&apiKey='.parent::getParameterValue('omim_api_key').'&mimNumber='.$omim_id;
111-
$buf = file_get_contents($url);
112-
if(strlen($buf) != 0) {
113-
file_put_contents($download_file, $buf);
114-
usleep(500000); // limit of 4 requests per second
115-
}
128+
$rfile = parent::getParameterValue('omim_api_url').'&apiKey='.parent::getParameterValue('omim_api_key').'&mimNumber='.$omim_id;
129+
Utils::DownloadSingle($rfile, $lfile);
116130
}
117131

118132
// load entry, parse and write to file
@@ -170,57 +184,6 @@ function Run()
170184
return TRUE;
171185
}
172186

173-
function getListOfEntries($ldir)
174-
{
175-
// get the master list of entries
176-
$file = "mim2gene.txt";
177-
if(!file_exists($ldir.$file)) {
178-
trigger_error($ldir.$file." not found. Will attempt to download. ", E_USER_NOTICE);
179-
$this->SetParameterValue('download',true);
180-
}
181-
182-
if(parent::getParameterValue('download')==true) {
183-
// connect
184-
if(!isset($ftp)) {
185-
$host = 'ftp.omim.org';
186-
echo "connecting to $host ...";
187-
$ftp = ftp_connect($host);
188-
if(!$ftp) {
189-
echo "Unable to connect to $host".PHP_EOL;
190-
die;
191-
}
192-
ftp_pasv ($ftp, true) ;
193-
$login = ftp_login($ftp, 'anonymous', '[email protected]');
194-
if ((!$ftp) || (!$login)) {
195-
echo "FTP-connect failed!"; die;
196-
} else {
197-
echo "Connected".PHP_EOL;
198-
}
199-
}
200-
201-
// download
202-
ftp_pasv($ftp, true);
203-
echo "Downloading $file ...";
204-
if(ftp_get($ftp, $ldir.$file, 'OMIM/'.$file, FTP_BINARY) === FALSE) {
205-
trigger_error("Error in downloading $file");
206-
}
207-
if(isset($ftp)) ftp_close($ftp);
208-
echo "success!".PHP_EOL;
209-
}
210-
211-
// parse the mim2gene file for the entries
212-
// # Mim Number Type Gene IDs Approved Gene Symbols
213-
$fp = fopen($ldir.$file,"r");
214-
fgets($fp);
215-
while($l = fgets($fp)) {
216-
$a = explode("\t",$l);
217-
if($a[1] != "moved/removed")
218-
$list[$a[0]] = $a[1];
219-
}
220-
fclose($fp);
221-
return $list;
222-
}
223-
224187

225188
function get_phenotype_mapping_method_type($id = null, $generate_declaration = false)
226189
{
@@ -326,6 +289,14 @@ function ParseEntry($obj, $type)
326289
}
327290
}
328291

292+
// check if moved
293+
if(isset($o['movedTo'])) {
294+
$new_omim_uri = parent::getNamespace().$o['movedTo'];
295+
parent::addRDF(
296+
parent::triplify($omim_uri, parent::getVoc()."superceded-by", $new_omim_uri)
297+
);
298+
}
299+
329300
// parse text sections
330301
if(isset($o['textSectionList'])) {
331302
foreach($o['textSectionList'] AS $i => $section) {
@@ -542,7 +513,8 @@ function ParseEntry($obj, $type)
542513

543514
$ns = '';
544515
switch($k) {
545-
case 'approvedGeneSymbols': $ns = 'symbol';break;
516+
case 'hgncID': $ns = 'hgnc';break;
517+
case 'approvedGeneSymbols': $ns = 'hgnc.symbol';break;
546518
case 'geneIDs': $ns = 'ncbigene';break;
547519
case 'ncbiReferenceSequences': $ns = 'gi';break;
548520
case 'genbankNucleotideSequences': $ns = 'gi';break;
@@ -568,6 +540,11 @@ function ParseEntry($obj, $type)
568540
case 'diseaseOntologyIDs': $ns = 'do';break;
569541

570542
// specifically ignoring
543+
case 'newbornScreening':
544+
case 'clinGenDosage':
545+
case 'clinGenValidity':
546+
case 'monarch':
547+
case 'decipherSyndromes':
571548
case 'geneTests':
572549
case 'cmgGene':
573550
case 'geneticAllianceIDs': // #

0 commit comments

Comments
 (0)