-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathxmlparse.py
43 lines (27 loc) · 1.29 KB
/
xmlparse.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
import gzip
import os
import re
try:
    import xml.etree.cElementTree as ET
except ImportError:  # cElementTree was removed in Python 3.9
    import xml.etree.ElementTree as ET
#import zipfile
#from io import BytesIO
#import xml.dom.minidom as minidom
#from xml.dom.minidom import parse, parseString
#directory = '/Users/cindyli/Documents/CS/Research/workspace-biomed/pubmeddata/basicversion/data'
# Accumulates every title matched by getTitles(); the script statements at the
# bottom of the file read this list when writing the output file.
titles = []

# A title is of interest when it contains "misdiagnosed as " or
# " masquerading as" with at least one character on each side. Compiled once
# so the pattern is not looked up again for every article.
# NOTE(review): the two alternatives carry their space on different sides;
# the pattern is kept unchanged so existing results stay the same -- confirm
# whether that asymmetry is intended.
_TITLE_PATTERN = re.compile(r'.+(?:misdiagnosed as | masquerading as).+')


def getTitles(directory):
    """Collect article titles that mention a misdiagnosis or a masquerade.

    Every entry of *directory* whose name ends in ``.gz`` is decompressed and
    parsed as XML. For each ``PubmedArticle`` element, the text of its
    ``MedlineCitation/Article/ArticleTitle`` child is matched against
    ``_TITLE_PATTERN`` and every match is appended to the module-level
    ``titles`` list.

    Args:
        directory: Path of the folder holding the gzip-compressed XML files.

    Returns:
        A new list with the titles matched during this call, in the order
        they were found. The same entries are appended to ``titles``.

    Raises:
        OSError: If *directory* cannot be listed or a ``.gz`` entry cannot be
            opened or decompressed.
        xml.etree.ElementTree.ParseError: If a file is not well-formed XML.
    """
    found = []
    for member in os.listdir(directory):
        if not member.endswith(".gz"):
            continue
        with gzip.open(os.path.join(directory, member), 'rb') as f:
            # Parse directly from the decompressed stream instead of first
            # reading the whole payload into one bytes object.
            root = ET.parse(f).getroot()
        for article in root.iter('PubmedArticle'):
            title = article.find('./MedlineCitation/Article/ArticleTitle')
            if title is None:
                continue
            # If the title element holds inline child elements, `.text` only
            # returns the text before the first child (or None), so gather
            # the text of the whole subtree instead.
            text = ''.join(title.itertext())
            matches = _TITLE_PATTERN.findall(text)
            if matches:
                found.extend(matches)
                # Mutate the shared list in place so existing readers of
                # `titles` keep seeing the results.
                titles.extend(matches)
    return found
# Hard-coded locations of the input data folder and of the results file.
DATA_DIR = '/Users/cindyli/Documents/CS/Research/workspace-biomed/pubmeddata/basicversion/data/pubmed/'
OUTPUT_PATH = '/Users/cindyli/Documents/CS/Research/workspace-biomed/pubmeddata/basicversion/titles.txt'

# Scan the data folder; getTitles() fills the module-level `titles` list.
getTitles(DATA_DIR)

# Mode 'a' appends, so lines written by earlier runs are kept. The encoding is
# pinned to UTF-8 so non-ASCII titles are written the same way regardless of
# the platform's default locale encoding.
with open(OUTPUT_PATH, 'a', encoding='utf-8') as out:
    # One title per line, written in a single batched call.
    out.writelines(elt + "\n" for elt in titles)