clamsproject · marcverhagen · Jun 11, 2024 · Jun 10, 2024 · Jun 10, 2024 · Jun 10, 2024
diff --git a/README.md b/README.md
@@ -1,20 +1,31 @@
 # Spacy NLP Service
 
+
 ## Description
-The spaCy NLP wrapped as a CLAMS service, spaCy is distributed under the [MIT license](https://github.com/explosion/spaCy/blob/master/LICENSE).
 
-## User instruction
+The spaCy NLP module wrapped as a CLAMS service, spaCy is distributed under the [MIT license](https://github.com/explosion/spaCy/blob/master/LICENSE).
+
+
+## User instructions
 
-General user instructions for CLAMS apps is available at [CLAMS Apps documentation](https://apps.clams.ai/clamsapp).
+General user instructions for CLAMS apps are available at [CLAMS Apps documentation](https://apps.clams.ai/clamsapp).
 
 ### System requirements
 
-This requires Python 3.8 or higher. For local install of required Python modules see [requirements.txt](requirements.txt).
+This requires Python 3.8 or higher. For local installation of required Python modules see [requirements.txt](requirements.txt).
+
+#### Running as a local python program
+
+To test just the wrapping code without using a server use the following:
+
+```bash
+python app.py -t example-mmif.json out.json
+```
 
-#### Using as local python programe
+### Configurable runtime parameters
 
-Use `python app.py -t example-mmif.json out.json` just to test the wrapping code without using a server.
+For the full list of parameters, please refer to the app metadata from in the CLAMS App Directory at [https://apps.clams.ai/#spacy-wrapper](https://apps.clams.ai/#spacy-wrapper) or the [`metadata.py`](metadata.py) file in this repository.
 
-### Configurable runtime parameter
+### Input and output details
 
-For the full list of parameters, please refer to the app metadata from [CLAMS App Directory](https://apps.clams.ai/clamsapp/) or [`metadata.py`](metadata.py) file in this repository.
+The spaCy code will run on each text document in the input MMIF file. The file `example-input.json` has one text document in the top level *documents* property and two text documents in one of the views. The output file `example-output.json` has a new view for each of those text documents.
diff --git a/app.py b/app.py
@@ -1,29 +1,23 @@
 """
-DELETE THIS MODULE STRING AND REPLACE IT WITH A DESCRIPTION OF YOUR APP.
-
-app.py Template
-
-The app.py script does several things:
-- import the necessary code
-- create a subclass of ClamsApp that defines the metadata and provides a method to run the wrapped NLP tool
-- provide a way to run the code as a RESTful Flask service 
 
+Wrapper for the Python spaCy library.
 
 """
 
 import argparse
 import logging
 from typing import Union
 
-import spacy
+# Imports needed for Clams and MMIF. Non-NLP Clams applications will require
+# AnnotationTypes, NLP apps require Uri
 from clams import ClamsApp, Restifier
 from lapps.discriminators import Uri
 from mmif import Mmif, View, Annotation, Document, AnnotationTypes, DocumentTypes
-from spacy.tokens import Doc
 
+# spaCy-specific imports
+import spacy
+from spacy.tokens import Doc
 
-# Imports needed for Clams and MMIF.
-# Non-NLP Clams applications will require AnnotationTypes
 
 class SpacyWrapper(ClamsApp):
 
@@ -41,17 +35,14 @@ def _appmetadata(self):
         pass
 
     def _annotate(self, mmif: Union[str, dict, Mmif], **parameters) -> Mmif:
-        if type(mmif) == Mmif:
 
-            mmif_obj = mmif
-        else:
-            mmif_obj = Mmif(mmif)
+        mmif_obj = mmif if type(mmif) == Mmif else Mmif(mmif)
 
         for doc in mmif_obj.get_documents_by_type(DocumentTypes.TextDocument):
             in_doc = None
             tok_idx = {}
-            if 'pretokenizd' in parameters and parameters['pretokenized']:
-                for view in mmif_obj.get_Views_for_document(doc.id):
+            if parameters.get('pretokenized') is True:
+                for view in mmif_obj.get_views_for_document(doc.id):
                     if Uri.TOKEN in view.metadata.contains:
                         tokens = [token.get_property('text') for token in view.get_annotations(Uri.TOKEN)]
                         tok_idx = {i : f'{view.id}:{token.id}'
@@ -63,18 +54,18 @@ def _annotate(self, mmif: Union[str, dict, Mmif], **parameters) -> Mmif:
             if in_doc is None:
                 in_doc = doc.text_value if not doc.location else open(doc.location_path()).read()
                 in_doc = self.nlp(in_doc)
-            
+
             did = f'{doc.parent}:{doc.id}' if doc.parent else doc.id
             view = mmif.new_view()
             self.sign_view(view, parameters)
             for attype in (Uri.TOKEN, Uri.POS, Uri.LEMMA, Uri.NCHUNK, Uri.SENTENCE, Uri.NE):
                 view.new_contain(attype, document=did)
-            
+
             for n, tok in enumerate(in_doc):
                 a = view.new_annotation(Uri.TOKEN)
                 if n not in tok_idx:
                     a.add_property("start", tok.idx)
-                    a.add_property("end", tok.idx + len(tok_idx))
+                    a.add_property("end", tok.idx + len(tok))
                     tok_idx[n] = a.id
                 else:
                     a.add_property('targets', [tok_idx[n]])
@@ -91,12 +82,13 @@ def _annotate(self, mmif: Union[str, dict, Mmif], **parameters) -> Mmif:
                         a.add_property('category', segment.label_)
         return mmif_obj
 
+
 def _test(infile, outfile):
     """Run spacy on an input MMIF file. This bypasses the server and just pings
     the annotate() method on the SpacyWrapper class. Prints a summary of the views
     in the end result."""
     app = SpacyWrapper()
-    print(app.appmetadata(pretty=True))
+    #print(app.appmetadata(pretty=True))
     with open(infile) as fh_in, open(outfile, 'w') as fh_out:
         mmif_out_as_string = app.annotate(fh_in.read(), pretty=True)
         mmif_out = Mmif(mmif_out_as_string)
@@ -106,7 +98,9 @@ def _test(infile, outfile):
                   % (view.id, len(view.annotations), view.metadata['app']))
 
 
+
 if __name__ == "__main__":
+
     parser = argparse.ArgumentParser()
     parser.add_argument("--port", action="store", default="5000", help="set port to listen")
     parser.add_argument("--production", action="store_true", help="run gunicorn server")

diff --git a/example-mmif.json → example-input.json b/example-mmif.json → example-input.json
@@ -1,23 +1,23 @@
 {
   "metadata": {
-    "mmif": "http://mmif.clams.ai/0.4.0"
+    "mmif": "http://mmif.clams.ai/1.0.4"
   },
   "documents": [
     {
       "@type": "http://mmif.clams.ai/0.4.0/vocabulary/VideoDocument",
       "properties": {
         "id": "m1",
         "mime": "video/mpeg",
-        "location": "/var/archive/video/example-video.mp4"
+        "location": "file:///var/archive/video/example-video.mp4"
       }
     },
     {
       "@type": "http://mmif.clams.ai/0.4.0/vocabulary/TextDocument",
       "properties": {
         "id": "m2",
-	"text": {
-	  "@value": "Hello, this is Jim Lehrer with the NewsHour on PBS. In the nineteen eighties, barking dogs have increasingly become a problem in urban areas."
-	}
+        "text": {
+          "@value": "Hello, this is Jim Lehrer with the NewsHour on PBS. In the nineteen eighties, barking dogs have increasingly become a problem in urban areas."
+        }
       }
     }
   ],
@@ -28,8 +28,8 @@
         "app": "http://mmif.clams.ai/apps/east/0.2.1",
         "contains": {
           "http://mmif.clams.ai/0.4.0/vocabulary/BoundingBox": {
-            "unit": "pixels",
-            "document": "m1"
+            "document": "m1",
+            "unit": "pixels"
           }
         }
       },