Skip to content

Commit b5bf388

Browse files
authored
Merge pull request #33 from clamsproject/develop
Bumping Python SDK version, bug fixes and documentation updates - Updated to clams-python 1.2.2 - Fixed token length (issue #30) - Fixed problems with the pretokenized parameter (issue #32) - Various documentation fixes.
2 parents ce95ecc + acbd5a4 commit b5bf388

7 files changed

+432
-748
lines changed

README.md

+19-8
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,31 @@
11
# Spacy NLP Service
22

3+
34
## Description
4-
The spaCy NLP wrapped as a CLAMS service, spaCy is distributed under the [MIT license](https://github.com/explosion/spaCy/blob/master/LICENSE).
55

6-
## User instruction
6+
The spaCy NLP module wrapped as a CLAMS service. spaCy is distributed under the [MIT license](https://github.com/explosion/spaCy/blob/master/LICENSE).
7+
8+
9+
## User instructions
710

8-
General user instructions for CLAMS apps is available at [CLAMS Apps documentation](https://apps.clams.ai/clamsapp).
11+
General user instructions for CLAMS apps are available at [CLAMS Apps documentation](https://apps.clams.ai/clamsapp).
912

1013
### System requirements
1114

12-
This requires Python 3.8 or higher. For local install of required Python modules see [requirements.txt](requirements.txt).
15+
This requires Python 3.8 or higher. For local installation of required Python modules see [requirements.txt](requirements.txt).
16+
17+
#### Running as a local python program
18+
19+
To test just the wrapping code without using a server use the following:
20+
21+
```bash
22+
python app.py -t example-input.json out.json
23+
```
1324

14-
#### Using as local python programe
25+
### Configurable runtime parameters
1526

16-
Use `python app.py -t example-mmif.json out.json` just to test the wrapping code without using a server.
27+
For the full list of parameters, please refer to the app metadata in the CLAMS App Directory at [https://apps.clams.ai/#spacy-wrapper](https://apps.clams.ai/#spacy-wrapper) or the [`metadata.py`](metadata.py) file in this repository.
1728

18-
### Configurable runtime parameter
29+
### Input and output details
1930

20-
For the full list of parameters, please refer to the app metadata from [CLAMS App Directory](https://apps.clams.ai/clamsapp/) or [`metadata.py`](metadata.py) file in this repository.
31+
The spaCy code will run on each text document in the input MMIF file. The file `example-input.json` has one text document in the top level *documents* property and two text documents in one of the views. The output file `example-output.json` has a new view for each of those text documents.

app.py

+16-22
Original file line numberDiff line numberDiff line change
@@ -1,29 +1,23 @@
11
"""
2-
DELETE THIS MODULE STRING AND REPLACE IT WITH A DESCRIPTION OF YOUR APP.
3-
4-
app.py Template
5-
6-
The app.py script does several things:
7-
- import the necessary code
8-
- create a subclass of ClamsApp that defines the metadata and provides a method to run the wrapped NLP tool
9-
- provide a way to run the code as a RESTful Flask service
102
3+
Wrapper for the Python spaCy library.
114
125
"""
136

147
import argparse
158
import logging
169
from typing import Union
1710

18-
import spacy
11+
# Imports needed for Clams and MMIF. Non-NLP Clams applications will require
12+
# AnnotationTypes, NLP apps require Uri
1913
from clams import ClamsApp, Restifier
2014
from lapps.discriminators import Uri
2115
from mmif import Mmif, View, Annotation, Document, AnnotationTypes, DocumentTypes
22-
from spacy.tokens import Doc
2316

17+
# spaCy-specific imports
18+
import spacy
19+
from spacy.tokens import Doc
2420

25-
# Imports needed for Clams and MMIF.
26-
# Non-NLP Clams applications will require AnnotationTypes
2721

2822
class SpacyWrapper(ClamsApp):
2923

@@ -41,17 +35,14 @@ def _appmetadata(self):
4135
pass
4236

4337
def _annotate(self, mmif: Union[str, dict, Mmif], **parameters) -> Mmif:
44-
if type(mmif) == Mmif:
4538

46-
mmif_obj = mmif
47-
else:
48-
mmif_obj = Mmif(mmif)
39+
mmif_obj = mmif if type(mmif) == Mmif else Mmif(mmif)
4940

5041
for doc in mmif_obj.get_documents_by_type(DocumentTypes.TextDocument):
5142
in_doc = None
5243
tok_idx = {}
53-
if 'pretokenizd' in parameters and parameters['pretokenized']:
54-
for view in mmif_obj.get_Views_for_document(doc.id):
44+
if parameters.get('pretokenized') is True:
45+
for view in mmif_obj.get_views_for_document(doc.id):
5546
if Uri.TOKEN in view.metadata.contains:
5647
tokens = [token.get_property('text') for token in view.get_annotations(Uri.TOKEN)]
5748
tok_idx = {i : f'{view.id}:{token.id}'
@@ -63,18 +54,18 @@ def _annotate(self, mmif: Union[str, dict, Mmif], **parameters) -> Mmif:
6354
if in_doc is None:
6455
in_doc = doc.text_value if not doc.location else open(doc.location_path()).read()
6556
in_doc = self.nlp(in_doc)
66-
57+
6758
did = f'{doc.parent}:{doc.id}' if doc.parent else doc.id
6859
view = mmif.new_view()
6960
self.sign_view(view, parameters)
7061
for attype in (Uri.TOKEN, Uri.POS, Uri.LEMMA, Uri.NCHUNK, Uri.SENTENCE, Uri.NE):
7162
view.new_contain(attype, document=did)
72-
63+
7364
for n, tok in enumerate(in_doc):
7465
a = view.new_annotation(Uri.TOKEN)
7566
if n not in tok_idx:
7667
a.add_property("start", tok.idx)
77-
a.add_property("end", tok.idx + len(tok_idx))
68+
a.add_property("end", tok.idx + len(tok))
7869
tok_idx[n] = a.id
7970
else:
8071
a.add_property('targets', [tok_idx[n]])
@@ -91,12 +82,13 @@ def _annotate(self, mmif: Union[str, dict, Mmif], **parameters) -> Mmif:
9182
a.add_property('category', segment.label_)
9283
return mmif_obj
9384

85+
9486
def _test(infile, outfile):
9587
"""Run spacy on an input MMIF file. This bypasses the server and just pings
9688
the annotate() method on the SpacyWrapper class. Prints a summary of the views
9789
in the end result."""
9890
app = SpacyWrapper()
99-
print(app.appmetadata(pretty=True))
91+
#print(app.appmetadata(pretty=True))
10092
with open(infile) as fh_in, open(outfile, 'w') as fh_out:
10193
mmif_out_as_string = app.annotate(fh_in.read(), pretty=True)
10294
mmif_out = Mmif(mmif_out_as_string)
@@ -106,7 +98,9 @@ def _test(infile, outfile):
10698
% (view.id, len(view.annotations), view.metadata['app']))
10799

108100

101+
109102
if __name__ == "__main__":
103+
110104
parser = argparse.ArgumentParser()
111105
parser.add_argument("--port", action="store", default="5000", help="set port to listen")
112106
parser.add_argument("--production", action="store_true", help="run gunicorn server")

example-mmif.json example-input.json

+7-7
Original file line numberDiff line numberDiff line change
@@ -1,23 +1,23 @@
11
{
22
"metadata": {
3-
"mmif": "http://mmif.clams.ai/0.4.0"
3+
"mmif": "http://mmif.clams.ai/1.0.4"
44
},
55
"documents": [
66
{
77
"@type": "http://mmif.clams.ai/0.4.0/vocabulary/VideoDocument",
88
"properties": {
99
"id": "m1",
1010
"mime": "video/mpeg",
11-
"location": "/var/archive/video/example-video.mp4"
11+
"location": "file:///var/archive/video/example-video.mp4"
1212
}
1313
},
1414
{
1515
"@type": "http://mmif.clams.ai/0.4.0/vocabulary/TextDocument",
1616
"properties": {
1717
"id": "m2",
18-
"text": {
19-
"@value": "Hello, this is Jim Lehrer with the NewsHour on PBS. In the nineteen eighties, barking dogs have increasingly become a problem in urban areas."
20-
}
18+
"text": {
19+
"@value": "Hello, this is Jim Lehrer with the NewsHour on PBS. In the nineteen eighties, barking dogs have increasingly become a problem in urban areas."
20+
}
2121
}
2222
}
2323
],
@@ -28,8 +28,8 @@
2828
"app": "http://mmif.clams.ai/apps/east/0.2.1",
2929
"contains": {
3030
"http://mmif.clams.ai/0.4.0/vocabulary/BoundingBox": {
31-
"unit": "pixels",
32-
"document": "m1"
31+
"document": "m1",
32+
"unit": "pixels"
3333
}
3434
}
3535
},

0 commit comments

Comments
 (0)