more descriptions in app metadata

keighrim · keighrim · commit 41848efc6d11 · 2024-04-22T13:58:57.000-04:00
diff --git a/metadata.py b/metadata.py
@@ -27,9 +27,13 @@ def appmetadata() -> AppMetadata:
                     'https://pypi.org/project/python-doctr . The model is capable of detecting text regions in the '
                     'input image and recognizing text in the regions. The text-localized regions are organized '
                     'hierarchically by the model into "pages" > "blocks" > "lines" > "words", and this CLAMS app '
-                    'translated into `TextDocument`, `Paragraphs`, `Sentence`, and `Token` annotations that represent '
-                    'recognized text contents, then aligned to `BoundingBox` annotations that represent the detected '
-                    'geometries.',
+                    'translates them into `TextDocument`, `Paragraph`, `Sentence`, and `Token` annotations to '
+                    'represent recognized text contents, then aligns them to `BoundingBox` annotations that represent '
+                    'the detected geometries. This hierarchical structure is also represented in the `TextDocument` '
+                    'annotation output as two newlines (`\\n\\n`) between "paragraphs", one newline (`\\n`) between '
+                    'the "lines", and one space (" ") between the "words". For the text recognition, the model is '
+                    'internally configured to use the "parseq" recognition model, and only works with English text '
+                    'at the moment.',
         app_license="Apache 2.0",
         identifier="doctr-wrapper",
         url="https://github.com/clamsproject/app-doctr-wrapper",
@@ -42,17 +46,23 @@ def appmetadata() -> AppMetadata:
                           '`representatives` property is present, the app will process videos still frames at the '
                           'underlying time point annotations that are referred to by the `representatives` property. '
                           'Otherwise, the app will process the middle frame of the video segment.')
-    metadata.add_output(DocumentTypes.TextDocument)
-    out_sent = metadata.add_output(at_type=Uri.SENTENCE)
-    out_sent.add_description('Translation of the recognized "text lines" in the processed input images')
-    out_para = metadata.add_output(at_type=Uri.PARAGRAPH)
-    out_para.add_description('Translation of the recognized "text blocks" in the processed input images')
-    out_tkn = metadata.add_output(at_type=Uri.TOKEN)
-    out_tkn.add_description('Translation of the recognized "text words" in the processed input images')
+    out_td = metadata.add_output(DocumentTypes.TextDocument, **{'@lang': 'en'})
+    out_td.add_description('Fully serialized text content of the recognized text in the input images. Serialization is '
+                           'done by concatenating `text` values of `Paragraph` annotations with two newline characters.')
+    out_tkn = metadata.add_output(at_type=Uri.TOKEN, text='*', word='*')
+    out_tkn.add_description('Translation of the recognized docTR "words" in the input images. `text` and `word` '
+                            'properties store the string values of the recognized text. The duplication is for keeping '
+                            'backward compatibility and consistency with `Paragraph` and `Sentence` annotations.')
+    out_sent = metadata.add_output(at_type=Uri.SENTENCE, text='*')
+    out_sent.add_description('Translation of the recognized docTR "lines" in the input images. `text` property stores '
+                             'the string value of space-joined words.')
+    out_para = metadata.add_output(at_type=Uri.PARAGRAPH, text='*')
+    out_para.add_description('Translation of the recognized docTR "blocks" in the input images. `text` property stores '
+                             'the string value of newline-joined sentences.')
     out_ali = metadata.add_output(AnnotationTypes.Alignment)
     out_ali.add_description('Alignments between 1) `TimePoint` <-> `TextDocument`, 2) `TimePoint` <-> '
                             '`Token`/`Sentence`/`Paragraph`, 3) `BoundingBox` <-> `Token`/`Sentence`/`Paragraph`')
-    out_bbox = metadata.add_output(AnnotationTypes.BoundingBox)
+    out_bbox = metadata.add_output(AnnotationTypes.BoundingBox, label='text')
     out_bbox.add_description('Bounding boxes of the detected text regions in the input images. No corresponding box '
                              'for the entire image (`TextDocument`) region')