normalize.py
import os
import re


def uncomment(fileName, content):
    """Strip comments from source code, dispatching on the file extension."""
    if fileName.endswith('.c'):
        # C: remove block comments only; line comments are left in place.
        return re.sub(r'/\*.*?\*/', '', content, flags=re.DOTALL)
    elif fileName.endswith(('.cpp', '.cc', '.h', '.hh', '.hxx', '.hpp')):
        # C++: remove block comments first, then line comments.
        pass1 = re.sub(r'/\*.*?\*/', '', content, flags=re.DOTALL)
        pass2 = re.sub(r'//.*', '', pass1)
        return pass2
    elif fileName.endswith('.py'):
        # Python: remove '#' comments, then triple-quoted docstrings.
        # Note this heuristic also strips '#' inside string literals and
        # any triple-quoted string, not just docstrings.
        pass1 = re.sub(r'#.*', '', content)
        pass2 = re.sub(r'""".*?"""', '', pass1, flags=re.DOTALL)
        return pass2
    return content


def search_files(data_dirs, suffixes):
    """Recursively collect paths of files whose names end in one of the suffixes."""
    matches = []  # list of files to read
    for data_dir in data_dirs:
        for root, _dirnames, filenames in os.walk(data_dir):
            for filename in filenames:
                if filename.endswith(tuple(suffixes)):
                    matches.append(os.path.join(root, filename))
    return matches


def tokenize(fileName, retcontent=False):
    """Split a source file into tokens after stripping its comments.

    Splits on runs of non-word characters; because the separator pattern
    is captured, punctuation runs survive as tokens of their own.
    """
    allTokens = []
    with open(fileName) as data:
        content = uncomment(fileName, data.read())
    for line in content.split('\n'):
        allTokens += [token.strip()
                      for token in re.split(r'(\W+)', line)
                      if len(token.strip()) > 0]
    if not retcontent:
        return allTokens
    return allTokens, content
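

# A minimal usage sketch, not part of the original module: walk a
# hypothetical 'data/' directory for C/C++/Python sources and tokenize
# each file found. The directory name and suffix list are assumptions
# chosen for illustration only.
if __name__ == '__main__':
    for path in search_files(['data'], ['.c', '.cpp', '.py']):
        tokens = tokenize(path)
        print(path, len(tokens), 'tokens')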