# Keywords whose presence suggests that a comment carries license text.
# Each entry is matched as a lowercase substring of the comment.
_LICENSE_KEYWORDS = (
    'source', 'free', 'under', 'use', 'copyright', 'grant', 'software',
    'license', 'licence', 'agreement', 'distribute', 'redistribution',
    'liability', 'rights', 'reserved', 'general', 'public', 'modify',
    'modified', 'modification', 'permission',
    # These two entries were previously fused into 'permittedgranted' by a
    # missing comma (implicit string concatenation), so neither ever matched.
    'permitted', 'granted',
    'distributed', 'notice', 'distribution', 'terms', 'freely', 'licensed',
    # Keep the historical misspelling (it does occur in real headers) and
    # add the correct spelling, which was never matched before.
    'merchantibility', 'merchantability',
    'redistributed', 'see', 'read', '(c)', 'copying', 'legal', 'licensing',
    'spdx',
)

# Comment categories read from the extracted data, in scanning order.
_COMMENT_CATEGORIES = (
    'multi_line_comment',
    'cont_single_line_comment',
    'single_line_comment',
)


def licenseComment(data):
    '''
    Pick the comment that most likely contains the license statement.

    Categories are scanned in the order multi-line, continuous single-line,
    single-line. The first comment containing "spdx-license-identifier"
    (case-insensitive) is returned immediately. Otherwise every comment is
    scored by the number of license keywords it contains and the highest
    scoring comment is returned; on a tie the earliest one wins.

    :param data: Mapping of comment category name to a list of items, each
                 item providing its text under the 'comment' key. Missing
                 categories are skipped.
    :return: Text of the selected comment, or an empty string when no
             comment contains any keyword
    '''
    if not data:
        return ""

    best_comment = ""
    best_score = 0
    for category in _COMMENT_CATEGORIES:
        if category not in data:
            continue
        for item in data[category]:
            text = item['comment']
            # Lowercase once per comment instead of once per keyword.
            lowered = text.lower()

            # An explicit SPDX tag is decisive; stop searching right away.
            if 'spdx-license-identifier' in lowered:
                return text

            score = sum(keyword in lowered for keyword in _LICENSE_KEYWORDS)
            # Strict comparison keeps the earliest comment on equal scores.
            if score > best_score:
                best_score = score
                best_comment = text

    return best_comment
'.sc','.txt','.lic','.install','.OSS','.gl'] - + supportedFileExtensions = list(LanguageMapper.LANG_MAP.keys()) + fd, outputFile = tempfile.mkstemp() fileType = os.path.splitext(inputFile)[1] with open(outputFile, 'w') as outFile: # if the file extension is supported if fileType in supportedFileExtensions: - data_file = extract(inputFile) + data_file = commentExtract(inputFile) data = json.loads(data_file) data1 = licenseComment(data) outFile.write(data1) diff --git a/pyproject.toml b/pyproject.toml index 716e93dc..bb885c70 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -9,6 +9,6 @@ requires = [ "scipy>=0.18.1", "textdistance>=3.0.3", "pyxDamerauLevenshtein>=1.5", - "nirjas>=0.0.3", + "nirjas>=0.0.5", "urllib3>=1.24.1" ] diff --git a/requirements.txt b/requirements.txt index b90ce4a7..a19d9c4d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,5 +6,5 @@ scipy>=0.18.1 spacy>=2.0.11 textdistance>=3.0.3 setuptools>=39.2.0 -nirjas>=0.0.3 -urllib3>=1.24.1 \ No newline at end of file +nirjas>=0.0.5 +urllib3>=1.24.1 diff --git a/setup.py b/setup.py index 5177656b..37cab80d 100755 --- a/setup.py +++ b/setup.py @@ -55,7 +55,7 @@ def read(fname): 'tqdm>=4.23.4', 'pandas>=0.23.1', 'urllib3>=1.24.1', - 'nirjas>=0.0.3' + 'nirjas>=0.0.5' ] requirements = [ @@ -68,7 +68,7 @@ def read(fname): 'textdistance>=3.0.3', 'pyxDamerauLevenshtein>=1.5', 'urllib3>=1.24.1', - 'nirjas>=0.0.3' + 'nirjas>=0.0.5' ] class BuildAtarashiDependencies(distutils.cmd.Command):