Merge pull request #81 from fossology/feat/nirjas/update-interface

feat(nirjas): Update interface with Nirjas 0.0.5

Reviewed-By: [email protected]
Tested-By: [email protected]
fossology · Jan 21, 2021 · 89476ad
2 parents a147f95 + fdc5f15
commit 89476ad
Show file tree

Hide file tree

Showing 4 changed files with 58 additions and 56 deletions.
diff --git a/atarashi/libs/commentPreprocessor.py b/atarashi/libs/commentPreprocessor.py
@@ -20,65 +20,69 @@
 """
 
 import argparse
-from nirjas import extract
 import json
 import os
 import sys
 import re
 import string
 import tempfile
 
+from nirjas import extract as commentExtract, LanguageMapper
+
 __author__ = "Aman Jain"
 __email__ = "[email protected]"
 
 args = None
 
 def licenseComment(data):
-    list = ['source', 'free', 'under','use',  'copyright', 'grant', 'software', 'license','licence', 'agreement', 'distribute', 'redistribution', 'liability', 'rights', 'reserved', 'general', 'public', 'modify', 'modified', 'modification', 'permission','permitted' 'granted', 'distributed', 'notice', 'distribution', 'terms', 'freely', 'licensed', 'merchantibility','redistributed', 'see', 'read', '(c)', 'copying', 'legal', 'licensing', 'spdx']
-
-    MLmapCount, CSLmapCount, SLmapCount = [], [], []
-    comment = ""
-    tempCount = 0
-    for id, item in enumerate(data[0]["multi_line_comment"]):
-        count = 0
-        if 'spdx-license-identifier' in item['comment'].lower():
-            return item['comment']
-
-        for i in list:
-            if i in item['comment'].lower():
-                count+=1
-
-        if count > tempCount:
-            tempCount = count
-            comment = item['comment']
-
-    if "cont_single_line_comment" in data[0]:
-      for id, item in enumerate(data[0]["cont_single_line_comment"]):
-          count = 0
-          if 'spdx-license-identifier' in item['comment'].lower():
-              return item['comment']
-
-          for i in list:
-              if i in item['comment'].lower():
-                  count+=1
-          if count > tempCount:
-              tempCount = count
-              comment = item['comment']
-
-    if "single_line_comment" in data[0]:
-      for id, item in enumerate(data[0]["single_line_comment"]):
-          count = 0
-          if 'spdx-license-identifier' in item['comment'].lower():
-              return item['comment']
-
-          for i in list:
-              if i in item['comment'].lower():
-                  count+=1
-          if count > tempCount:
-              tempCount = count
-              comment = item['comment']
-
-    return comment
+  '''
+  Select the comment that most likely holds the license statement.
+
+  :param data: Parsed extractor output; may contain "multi_line_comment",
+    "cont_single_line_comment" and "single_line_comment" lists of dicts
+    whose text is stored under the "comment" key
+  :return: The first comment containing "spdx-license-identifier" (groups
+    are scanned in the order listed above); otherwise the comment matching
+    the most keywords, or "" when no comment matches any keyword
+  '''
+  # NOTE: keep 'permitted' and 'granted' comma-separated; adjacent literals
+  # silently concatenate into 'permittedgranted', which is not a real word.
+  match_list = [
+    'source', 'free', 'under', 'use', 'copyright', 'grant', 'software',
+    'license', 'licence', 'agreement', 'distribute', 'redistribution',
+    'liability', 'rights', 'reserved', 'general', 'public', 'modify',
+    'modified', 'modification', 'permission', 'permitted', 'granted',
+    'distributed', 'notice', 'distribution', 'terms', 'freely', 'licensed',
+    'merchantability', 'redistributed', 'see', 'read', '(c)', 'copying',
+    'legal', 'licensing', 'spdx',
+  ]
+
+  # Scan order matters: it decides which SPDX comment wins and breaks ties.
+  comment_groups = [
+    'multi_line_comment',
+    'cont_single_line_comment',
+    'single_line_comment',
+  ]
+
+  best_comment = ""
+  best_count = 0
+  for group in comment_groups:
+    if group not in data:
+      continue
+    for item in data[group]:
+      text = item['comment']
+      lowered = text.lower()
+      # An explicit SPDX tag is decisive, so stop at the first one found.
+      if 'spdx-license-identifier' in lowered:
+        return text
+
+      count = sum(1 for keyword in match_list if keyword in lowered)
+      # Strict '>' keeps the earliest comment when several score the same.
+      if count > best_count:
+        best_count = count
+        best_comment = text
+
+  return best_comment
 
 
 class CommentPreprocessor(object):
@@ -114,17 +118,15 @@ def extract(inputFile):
     :return: Temp file path from the OS
     '''
 
-    supportedFileExtensions = ['.py','.m4','.nsi','.c','.h','.cs','.cpp','.sep','.hxx','.cc','.css','.go','.hs','.html',
-                  '.xml','.java','.js','.kt','.kts','.ktm','.m','.php','.pl','.r','.R','.rb','.rs','.sh','.swift','.scala',
-                  '.sc','.txt','.lic','.install','.OSS','.gl']
-
+    supportedFileExtensions = list(LanguageMapper.LANG_MAP.keys())
+
     fd, outputFile = tempfile.mkstemp()
     fileType = os.path.splitext(inputFile)[1]
 
     with open(outputFile, 'w') as outFile:
       # if the file extension is supported
       if fileType in supportedFileExtensions:
-        data_file = extract(inputFile)
+        data_file = commentExtract(inputFile)
         data = json.loads(data_file)
         data1 = licenseComment(data)
         outFile.write(data1)

diff --git a/pyproject.toml b/pyproject.toml
@@ -9,6 +9,6 @@ requires = [
   "scipy>=0.18.1",
   "textdistance>=3.0.3",
   "pyxDamerauLevenshtein>=1.5",
-  "nirjas>=0.0.3",
+  "nirjas>=0.0.5",
   "urllib3>=1.24.1"
 ]
diff --git a/requirements.txt b/requirements.txt
@@ -6,5 +6,5 @@ scipy>=0.18.1
 spacy>=2.0.11
 textdistance>=3.0.3
 setuptools>=39.2.0
-nirjas>=0.0.3
-urllib3>=1.24.1
+nirjas>=0.0.5
+urllib3>=1.24.1
diff --git a/setup.py b/setup.py
@@ -55,7 +55,7 @@ def read(fname):
   'tqdm>=4.23.4',
   'pandas>=0.23.1',
   'urllib3>=1.24.1',
-  'nirjas>=0.0.3'
+  'nirjas>=0.0.5'
 ]
 
 requirements = [
@@ -68,7 +68,7 @@ def read(fname):
   'textdistance>=3.0.3',
   'pyxDamerauLevenshtein>=1.5',
   'urllib3>=1.24.1',
-  'nirjas>=0.0.3'
+  'nirjas>=0.0.5'
 ]
 
 class BuildAtarashiDependencies(distutils.cmd.Command):