Skip to content

Develop #33

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Jun 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 19 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,20 +1,31 @@
# Spacy NLP Service


## Description
The spaCy NLP wrapped as a CLAMS service, spaCy is distributed under the [MIT license](https://github.com/explosion/spaCy/blob/master/LICENSE).

## User instruction
The spaCy NLP module wrapped as a CLAMS service. spaCy is distributed under the [MIT license](https://github.com/explosion/spaCy/blob/master/LICENSE).


## User instructions

General user instructions for CLAMS apps is available at [CLAMS Apps documentation](https://apps.clams.ai/clamsapp).
General user instructions for CLAMS apps are available at [CLAMS Apps documentation](https://apps.clams.ai/clamsapp).

### System requirements

This requires Python 3.8 or higher. For local install of required Python modules see [requirements.txt](requirements.txt).
This requires Python 3.8 or higher. For local installation of required Python modules see [requirements.txt](requirements.txt).

#### Running as a local python program

To test just the wrapping code without using a server use the following:

```bash
python app.py -t example-input.json out.json
```

#### Using as local python programe
### Configurable runtime parameters

Use `python app.py -t example-mmif.json out.json` just to test the wrapping code without using a server.
For the full list of parameters, please refer to the app metadata in the CLAMS App Directory at [https://apps.clams.ai/#spacy-wrapper](https://apps.clams.ai/#spacy-wrapper) or the [`metadata.py`](metadata.py) file in this repository.

### Configurable runtime parameter
### Input and output details

For the full list of parameters, please refer to the app metadata from [CLAMS App Directory](https://apps.clams.ai/clamsapp/) or [`metadata.py`](metadata.py) file in this repository.
The spaCy code will run on each text document in the input MMIF file. The file `example-input.json` has one text document in the top-level *documents* property and two text documents in one of the views. The output file `example-output.json` has a new view for each of those text documents.
38 changes: 16 additions & 22 deletions app.py
Original file line number Diff line number Diff line change
@@ -1,29 +1,23 @@
"""
DELETE THIS MODULE STRING AND REPLACE IT WITH A DESCRIPTION OF YOUR APP.

app.py Template

The app.py script does several things:
- import the necessary code
- create a subclass of ClamsApp that defines the metadata and provides a method to run the wrapped NLP tool
- provide a way to run the code as a RESTful Flask service

Wrapper for the Python spaCy library.

"""

import argparse
import logging
from typing import Union

import spacy
# Imports needed for Clams and MMIF. Non-NLP Clams applications will require
# AnnotationTypes, NLP apps require Uri
from clams import ClamsApp, Restifier
from lapps.discriminators import Uri
from mmif import Mmif, View, Annotation, Document, AnnotationTypes, DocumentTypes
from spacy.tokens import Doc

# spaCy-specific imports
import spacy
from spacy.tokens import Doc

# Imports needed for Clams and MMIF.
# Non-NLP Clams applications will require AnnotationTypes

class SpacyWrapper(ClamsApp):

Expand All @@ -41,17 +35,14 @@ def _appmetadata(self):
pass

def _annotate(self, mmif: Union[str, dict, Mmif], **parameters) -> Mmif:
if type(mmif) == Mmif:

mmif_obj = mmif
else:
mmif_obj = Mmif(mmif)
mmif_obj = mmif if type(mmif) == Mmif else Mmif(mmif)

for doc in mmif_obj.get_documents_by_type(DocumentTypes.TextDocument):
in_doc = None
tok_idx = {}
if 'pretokenizd' in parameters and parameters['pretokenized']:
for view in mmif_obj.get_Views_for_document(doc.id):
if parameters.get('pretokenized') is True:
for view in mmif_obj.get_views_for_document(doc.id):
if Uri.TOKEN in view.metadata.contains:
tokens = [token.get_property('text') for token in view.get_annotations(Uri.TOKEN)]
tok_idx = {i : f'{view.id}:{token.id}'
Expand All @@ -63,18 +54,18 @@ def _annotate(self, mmif: Union[str, dict, Mmif], **parameters) -> Mmif:
if in_doc is None:
in_doc = doc.text_value if not doc.location else open(doc.location_path()).read()
in_doc = self.nlp(in_doc)

did = f'{doc.parent}:{doc.id}' if doc.parent else doc.id
view = mmif.new_view()
self.sign_view(view, parameters)
for attype in (Uri.TOKEN, Uri.POS, Uri.LEMMA, Uri.NCHUNK, Uri.SENTENCE, Uri.NE):
view.new_contain(attype, document=did)

for n, tok in enumerate(in_doc):
a = view.new_annotation(Uri.TOKEN)
if n not in tok_idx:
a.add_property("start", tok.idx)
a.add_property("end", tok.idx + len(tok_idx))
a.add_property("end", tok.idx + len(tok))
tok_idx[n] = a.id
else:
a.add_property('targets', [tok_idx[n]])
Expand All @@ -91,12 +82,13 @@ def _annotate(self, mmif: Union[str, dict, Mmif], **parameters) -> Mmif:
a.add_property('category', segment.label_)
return mmif_obj


def _test(infile, outfile):
"""Run spacy on an input MMIF file. This bypasses the server and just pings
the annotate() method on the SpacyWrapper class. Prints a summary of the views
in the end result."""
app = SpacyWrapper()
print(app.appmetadata(pretty=True))
#print(app.appmetadata(pretty=True))
with open(infile) as fh_in, open(outfile, 'w') as fh_out:
mmif_out_as_string = app.annotate(fh_in.read(), pretty=True)
mmif_out = Mmif(mmif_out_as_string)
Expand All @@ -106,7 +98,9 @@ def _test(infile, outfile):
% (view.id, len(view.annotations), view.metadata['app']))



if __name__ == "__main__":

parser = argparse.ArgumentParser()
parser.add_argument("--port", action="store", default="5000", help="set port to listen")
parser.add_argument("--production", action="store_true", help="run gunicorn server")
Expand Down
14 changes: 7 additions & 7 deletions example-mmif.json → example-input.json
Original file line number Diff line number Diff line change
@@ -1,23 +1,23 @@
{
"metadata": {
"mmif": "http://mmif.clams.ai/0.4.0"
"mmif": "http://mmif.clams.ai/1.0.4"
},
"documents": [
{
"@type": "http://mmif.clams.ai/0.4.0/vocabulary/VideoDocument",
"properties": {
"id": "m1",
"mime": "video/mpeg",
"location": "/var/archive/video/example-video.mp4"
"location": "file:///var/archive/video/example-video.mp4"
}
},
{
"@type": "http://mmif.clams.ai/0.4.0/vocabulary/TextDocument",
"properties": {
"id": "m2",
"text": {
"@value": "Hello, this is Jim Lehrer with the NewsHour on PBS. In the nineteen eighties, barking dogs have increasingly become a problem in urban areas."
}
"text": {
"@value": "Hello, this is Jim Lehrer with the NewsHour on PBS. In the nineteen eighties, barking dogs have increasingly become a problem in urban areas."
}
}
}
],
Expand All @@ -28,8 +28,8 @@
"app": "http://mmif.clams.ai/apps/east/0.2.1",
"contains": {
"http://mmif.clams.ai/0.4.0/vocabulary/BoundingBox": {
"unit": "pixels",
"document": "m1"
"document": "m1",
"unit": "pixels"
}
}
},
Expand Down
Loading
Loading