Merge remote-tracking branch 'origin/1.0.7'

keighrim · keighrim · commit 555b3cc9a7ec · 2023-07-24T13:24:55.000-04:00
diff --git a/.dockerignore b/.dockerignore
@@ -1,13 +1,11 @@
 .git/
+.github/
 .gitignore
+.dockerignore
+Dockerfile
+Containerfile
+bin/
+build/
 *~
 **/*.pyc
-**/__pycache__
-*.md
-out*
-backup.sh*
-venv
-dbpedia-spotlight-1.0.0.jar
-entity-linking
-en.tar.gz
-en
+**/__pycache__
diff --git a/.github/for-clams-team.md b/.github/for-clams-team.md
@@ -0,0 +1,21 @@
+
+This directory contains GitHub-related files that help project management and the release process of CLAMS apps. 
+To use these workflows, your app must be part of `clamsproject` organization.
+To create a new repository under the `clamsproject` organization, here are some naming conventions to follow.
+
+* App repositories in the `clamsproject` organization should be prefixed with `app-` (e.g., `app-myapp`).
+* An app that wraps an extant tool or application should be suffixed with `-wrapper` (e.g., `app-their-app-wrapper`).
+* `LICENSE` file should always contain licensing information of the terminal code. If the app is a wrapper, an additional file containing licensing information of the underlying tool must be placed next to the `LICENSE` file when the original license requires so.
+
+(Your "app name" that you used in `clams develop` to create this scaffolding doesn't have to match the repository name.)
+
+In the `workflows` directory, you'll find:
+
+* `issue-apps-project.yml`: this workflow will add all new issues and PRs to our [`apps` project board](https://github.com/orgs/clamsproject/projects/12).
+* `issue-assign.yml`: this workflow will assign an issue to the person who created a branch for the issue. A branch is for an issue when its name starts with the `issueNum-` prefix. (e.g., `3-fix` branch is for issue number 3)
+* `issue-close.yml`: this workflow will remove all assignees from closed/merged issues and PRs.
+* `publish.yml`: this workflow is the main driver for the app release process. A release process is set to be triggered by **any** tag push. To change the trigger, edit the `on.push.tags` part of the file ([reference](https://docs.github.com/en/actions/using-workflows/events-that-trigger-workflows#running-your-workflow-only-when-a-push-of-specific-tags-occurs)).
+  * The workflow will
+    1. build a container image for the app and push it to [the `clamsproject` ghcr](https://github.com/orgs/clamsproject/packages).
+    2. generate app directory entry files and create a PR to [the app directory repository](https://github.com/clamsproject/apps) for registration.
+  * **NOTE**: Throughout the entire release process, the git tag that triggered the workflow will be used as the version of the app.
diff --git a/.github/workflows/issue-apps-project.yml b/.github/workflows/issue-apps-project.yml
@@ -1,4 +1,4 @@
-name: "add to `apps` GH project when a new issue is submitted"
+name: "🗂 Add new issue to `apps` GHP"
 
 on:
   issues:
@@ -11,8 +11,9 @@ on:
 
 jobs:
   call-assign:
+    name: "🤙 Call GHP workflow"
     uses: clamsproject/.github/.github/workflows/repo-issue-project.yml@main
     secrets: inherit
     with:
-      projectnum: '12'
+      projectnum: 12
 
diff --git a/.github/workflows/issue-assign.yml b/.github/workflows/issue-assign.yml
@@ -1,10 +1,11 @@
-name: "assign an issue when issue branch is created"
+name: "🙆 Assign issue"
 
 on:
   create:
 
 jobs:
   call-assign:
     if: github.ref_type == 'branch'
+    name: "🤙 Call assignment workflow"
     uses: clamsproject/.github/.github/workflows/repo-issue-assign.yml@main
     secrets: inherit
diff --git a/.github/workflows/issue-close.yml b/.github/workflows/issue-close.yml
@@ -1,4 +1,4 @@
-name: "unassign all when an issue is closed"
+name: "🙅 Unassign assignees"
 
 on:
   issues:
@@ -10,5 +10,6 @@ on:
 
 jobs:
   call-unassign:
+    name: "🤙 Call unassignment workflow"
     uses: clamsproject/.github/.github/workflows/repo-issue-close.yml@main
     secrets: inherit
diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
@@ -1,4 +1,4 @@
-name: "App Publisher: image>ghcr, metadata>appdir"
+name: "📦 Publish image>ghcr, metadata>appdir"
 
 on:
   workflow_dispatch:
@@ -14,33 +14,33 @@ on:
 jobs:
   set-version:
     runs-on: ubuntu-latest
-    name: 📌 Set VERSION value
+    name: "🏷 Set version value"
     outputs:
       version: ${{ steps.output_version.outputs.version }}
     steps:
-      - name: set VERSION value from dispatch inputs
+      - name: "📌 Set VERSION value from dispatch inputs"
         id: get_version_dispatch
         if: ${{ github.event_name == 'workflow_dispatch' }}
         run: echo "VERSION=${{ github.event.inputs.tag }}" >> $GITHUB_ENV
-      - name: set VERSION value from pushed tag
+      - name: "📌 Set VERSION value from pushed tag"
         id: get_version_tag
         if: ${{ github.event_name == 'push' }}
         run: echo "VERSION=$(echo "${{ github.ref }}" | cut -d/ -f3)" >> $GITHUB_ENV
-      - name: output result into an env-var
+      - name: "🏷 Record VERSION for follow-up jobs"
         id: output_version
         run: |
           echo "version=${{ env.VERSION }}" >> $GITHUB_OUTPUT
   publish-image:
     needs: ['set-version']
-    name: 🐳 Build and deploy to a container repository
+    name: "🤙 Call app container workflow"
     uses: clamsproject/.github/.github/workflows/app-container.yml@main
     secrets: inherit
     with:
       version: ${{ needs.set-version.outputs.version }}
       arm64: false
   register-appdir:
     needs: ['set-version', 'publish-image']
-    name: 📝 Register to CLASM app directory
+    name: "🤙 Call app registration workflow"
     uses: clamsproject/apps/.github/workflows/register.yml@main
     secrets: inherit
     with:
diff --git a/.gitignore b/.gitignore
@@ -162,10 +162,3 @@ dmypy.json
 # ctag generated file
 tags
 .tags
-
-/en.tar.gz
-/en
-/dbpedia-spotlight-1.0.0.jar
-/spotlight-model_lang=en.tar.gz
-/venv
-
diff --git a/Containerfile b/Containerfile
@@ -1,10 +1,32 @@
-FROM ghcr.io/clamsproject/clams-python:0.6.0
+# Use the same base image version as the clams-python python library version
+FROM ghcr.io/clamsproject/clams-python:1.0.7
+# See https://github.com/orgs/clamsproject/packages?tab=packages&q=clams-python for more base images
+# IF you want to automatically publish this image to the clamsproject organization, 
+# 1. you should have generated this template without --no-github-actions flag
+# 1. to add arm64 support, change the `arm64` line in .github/workflows/publish.yml
+#     * NOTE that a lot of software doesn't install/compile or run on arm64 architecture out of the box
+#     * make sure you locally test the compatibility of all software dependencies before using arm64 support 
+# 1. use a git tag to trigger the github action. You need to use git tag to properly set app version anyway
 
+################################################################################
+# DO NOT EDIT THIS SECTION
 ARG CLAMS_APP_VERSION
 ENV CLAMS_APP_VERSION ${CLAMS_APP_VERSION}
+################################################################################
 
+################################################################################
+# clams-python base images are based on debian distro
+# install more system packages as needed using the apt manager
+################################################################################
+
+################################################################################
+# main app installation
 COPY ./ /app
 WORKDIR /app
 RUN pip3 install -r requirements.txt
 
+RUN python3 -m spacy download en_core_web_sm
+
+# default command to run the CLAMS app in a production server 
 CMD ["python3", "app.py", "--production"]
+################################################################################
diff --git a/README.md b/README.md
@@ -1,4 +1,48 @@
-## User instruction
+# Spacy NLP Service
 
-General user instructions for CLAMS apps is available at [CLAMS Apps documentation](https://apps.clams.ai/clamsapp).
+The spaCy NLP tool wrapped as a CLAMS service. spaCy is distributed under the [MIT license](https://github.com/explosion/spaCy/blob/master/LICENSE).
 
+This requires Python 3.8 or higher. For local install of required Python modules see [requirements.txt](requirements.txt).
+
+## Using this service
+
+Use `python app.py -t example-mmif.json out.json` just to test the wrapping code without using a server. To test this using a server you run the app as a service in one terminal:
+
+```bash
+$ python app.py
+```
+
+And poke at it from another:
+
+```bash
+$ curl http://0.0.0.0:5000/
+$ curl -H "Accept: application/json" -X POST -d@example-mmif.json http://0.0.0.0:5000/
+```
+
+In CLAMS you usually run this in a container. To create an image:
+
+```bash
+$ docker build -f Containerfile -t clams-spacy-wrapper .
+```
+
+And to run it as a container:
+
+```bash
+$ docker run --rm -d -p 5000:5000 clams-spacy-wrapper
+$ curl -H "Accept: application/json" -X POST -d@example-mmif.json http://0.0.0.0:5000/
+```
+
+The spaCy code will run on each text document in the input MMIF file. The file `example-mmif.json` has one text document in the top level `documents` property and two text documents in one of the views. The text documents all look as follows:
+
+```json
+{
+  "@type": "http://mmif.clams.ai/0.4.0/vocabulary/TextDocument",
+  "properties": {
+    "id": "m2",
+    "text": {
+      "@value": "Hello, this is Jim Lehrer with the NewsHour on PBS...."
+    }
+  }
+}
+```
+Instead of a `text:@value` property the text could be in an external file, which would be given as a URI in the `location` property. See the readme file in [https://github.com/clamsproject/app-nlp-example](https://github.com/clamsproject/app-nlp-example) on how to do this.
diff --git a/app.py b/app.py
@@ -1,56 +1,58 @@
-"""app.py
-
-Wrapping Spacy NLP to extract tokens, tags, lemmas, sentences, chunks and named
-entities.
-
-Usage:
-
-$ python app.py -t example-mmif.json out.json
-$ python app.py [--develop]
-
-The first invocation is to just test the app without running a server. The
-second is to start a server, which you can ping with
+"""
+DELETE THIS MODULE STRING AND REPLACE IT WITH A DESCRIPTION OF YOUR APP.
 
-$ curl -H "Accept: application/json" -X POST -d@example-mmif.json http://0.0.0.0:5000/
+app.py Template
 
-With the --develop option you get a FLask server running in development mode,
-without it Gunicorn will be used for a more stable server.
+The app.py script does several things:
+- import the necessary code
+- create a subclass of ClamsApp that defines the metadata and provides a method to run the wrapped NLP tool
+- provide a way to run the code as a RESTful Flask service 
 
-Normally you would run this in a Docker container, see README.md.
 
 """
 
 import argparse
 from typing import Union
 
-import spacy
-from clams.app import ClamsApp
-from clams.restify import Restifier
+# Imports needed for Clams and MMIF.
+# Non-NLP Clams applications will require AnnotationTypes
+
+from clams import ClamsApp, Restifier
+from mmif import Mmif, View, Annotation, Document, AnnotationTypes, DocumentTypes
+
+# For an NLP tool we need to import the LAPPS vocabulary items
 from lapps.discriminators import Uri
-from mmif.serialize import Mmif
-from mmif.vocabulary import DocumentTypes
-from spacy.tokens import Doc
 
+# Spacy imports
+import spacy
+from spacy.tokens import Doc
 
 class SpacyWrapper(ClamsApp):
 
     def __init__(self):
         super().__init__()
-        # Load small English core model
+        # load small English core model
         self.nlp = spacy.load("en_core_web_sm")
 
     def _appmetadata(self):
+        # see metadata.py
         pass
 
     def _annotate(self, mmif: Union[str, dict, Mmif], **parameters) -> Mmif:
-        for doc in mmif.get_documents_by_type(DocumentTypes.TextDocument):
+        if type(mmif) == Mmif:
+
+            mmif_obj = mmif
+        else:
+            mmif_obj = Mmif(mmif)
+
+        for doc in mmif_obj.get_documents_by_type(DocumentTypes.TextDocument):
             in_doc = None
             tok_idx = {}
-            if 'pretokenized' in parameters and parameters['pretokenized']:
-                for view in mmif.get_views_for_document(doc.id):
+            if 'pretokenizd' in parameters and parameters['pretokenized']:
+                for view in mmif_obj.get_Views_for_document(doc.id):
                     if Uri.TOKEN in view.metadata.contains:
-                        tokens = [token.properties['text'] for token in view.get_annotations(Uri.TOKEN)]
-                        tok_idx = {i: f'{view.id}:{token.id}'
+                        tokens = [token.get_property('text') for token in view.get_annotations(Uri.TOKEN)]
+                        tok_idx = {i : f'{view.id}:{token.id}'
                                    for i, token in enumerate(view.get_annotations(Uri.TOKEN))}
                         in_doc = Doc(self.nlp.vocab, tokens)
                         self.nlp.add_pipe("sentencizer")
@@ -59,19 +61,18 @@ def _annotate(self, mmif: Union[str, dict, Mmif], **parameters) -> Mmif:
             if in_doc is None:
                 in_doc = doc.text_value if not doc.location else open(doc.location_path()).read()
                 in_doc = self.nlp(in_doc)
-
+            
             did = f'{doc.parent}:{doc.id}' if doc.parent else doc.id
             view = mmif.new_view()
-            self.sign_view(view)
-            for attype in (Uri.TOKEN, Uri.POS, Uri.LEMMA,
-                           Uri.NCHUNK, Uri.SENTENCE, Uri.NE):
+            self.sign_view(view, parameters)
+            for attype in (Uri.TOKEN, Uri.POS, Uri.LEMMA, Uri.NCHUNK, Uri.SENTENCE, Uri.NE):
                 view.new_contain(attype, document=did)
-
+            
             for n, tok in enumerate(in_doc):
                 a = view.new_annotation(Uri.TOKEN)
                 if n not in tok_idx:
-                    a.add_property('start', tok.idx)
-                    a.add_property('end', tok.idx + len(tok.text))
+                    a.add_property("start", tok.idx)  # character offset where the token begins
+                    a.add_property("end", tok.idx + len(tok.text))  # end offset = start + token length (not the size of tok_idx)
                     tok_idx[n] = a.id
                 else:
                     a.add_property('targets', [tok_idx[n]])
@@ -86,11 +87,9 @@ def _annotate(self, mmif: Union[str, dict, Mmif], **parameters) -> Mmif:
                     a.add_property('text', segment.text)
                     if segment.label_:
                         a.add_property('category', segment.label_)
+        return mmif_obj
 
-        return mmif
-
-
-def test(infile, outfile):
+def _test(infile, outfile):
     """Run spacy on an input MMIF file. This bypasses the server and just pings
     the annotate() method on the SpacyWrapper class. Prints a summary of the views
     in the end result."""
@@ -118,14 +117,16 @@ def test(infile, outfile):
     parsed_args = parser.parse_args()
 
     if parsed_args.test:
-        test(parsed_args.infile, parsed_args.outfile)
+        _test(parsed_args.infile, parsed_args.outfile)
     else:
         # create the app instance
         app = SpacyWrapper()
 
         http_app = Restifier(app, port=int(parsed_args.port)
         )
+        # for running the application in production mode
         if parsed_args.production:
             http_app.serve_production()
+        # development mode
         else:
             http_app.run()
diff --git a/metadata.py b/metadata.py
diff --git a/requirements.txt b/requirements.txt