-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtest_multimodal_pipelines.py
96 lines (83 loc) · 4.09 KB
/
test_multimodal_pipelines.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
import os.path as osp
# Absolute path of the sample PDF used by the tests below; it is looked up in
# the "assets" directory that sits next to this test module.
_ASSETS_DIR = osp.join(osp.dirname(__file__), "assets")
PDF_FILE_PATH = osp.abspath(osp.join(_ASSETS_DIR, "Multimodal_sample_file.pdf"))
class TestMultimodalPipelines:
    """Tests for pipeline transformations."""

    def test_pipeline(self):
        """Check that a Pipeline exposes the name and transformations it was built with."""
        from clarifai_datautils.multimodal import Pipeline
        from clarifai_datautils.multimodal.pipeline.cleaners import Clean_extra_whitespace
        from clarifai_datautils.multimodal.pipeline.PDF import PDFPartitionMultimodal

        pipeline = Pipeline(
            name='pipeline-1',
            transformations=[
                PDFPartitionMultimodal(chunking_strategy="by_title", max_characters=1024),
                Clean_extra_whitespace(),
            ])

        assert pipeline.name == 'pipeline-1'
        assert len(pipeline.transformations) == 2

    def test_pipeline_run(self):
        """Run the pipeline on the sample PDF with ``loader=False`` and check the returned list."""
        from clarifai_datautils.multimodal import Pipeline
        from clarifai_datautils.multimodal.pipeline.cleaners import Clean_extra_whitespace
        from clarifai_datautils.multimodal.pipeline.extractors import ExtractEmailAddress
        from clarifai_datautils.multimodal.pipeline.PDF import PDFPartitionMultimodal

        pipeline = Pipeline(
            name='pipeline-1',
            transformations=[
                PDFPartitionMultimodal(chunking_strategy="by_title", max_characters=1024),
                Clean_extra_whitespace(),
                ExtractEmailAddress(),
            ])

        elements = pipeline.run(files=PDF_FILE_PATH, loader=False)

        # Verify the container type before relying on its length and indexing.
        assert isinstance(elements, list)
        assert len(elements) == 14

        # Build the metadata dict once instead of once per assertion.
        first_metadata = elements[0].metadata.to_dict()
        assert first_metadata['filename'] == 'Multimodal_sample_file.pdf'
        assert first_metadata['page_number'] == 1
        # NOTE(review): the expected value below looks like an e-mail obfuscation
        # placeholder rather than a real address -- confirm against the sample PDF.
        assert first_metadata['email_address'] == ['[email protected]']

        assert type(elements[6]).__name__ == 'Table'
        assert type(elements[-1]).__name__ == 'Image'

    def test_pipeline_run_loader(self):
        """Run the pipeline with ``loader=True`` and check the returned loader object."""
        from clarifai_datautils.multimodal import Pipeline
        from clarifai_datautils.multimodal.pipeline.cleaners import Clean_extra_whitespace
        from clarifai_datautils.multimodal.pipeline.extractors import ExtractEmailAddress
        from clarifai_datautils.multimodal.pipeline.PDF import PDFPartitionMultimodal

        pipeline = Pipeline(
            name='pipeline-1',
            transformations=[
                PDFPartitionMultimodal(chunking_strategy="by_title", max_characters=1024),
                Clean_extra_whitespace(),
                ExtractEmailAddress(),
            ])

        loader = pipeline.run(files=PDF_FILE_PATH, loader=True)

        # The class is compared by name, so it does not have to be imported here.
        assert type(loader).__name__ == 'MultiModalLoader'
        assert len(loader) == 14
        assert loader.elements[0].metadata.to_dict()['filename'] == 'Multimodal_sample_file.pdf'

    def test_pipeline_summarize(self):
        """Run the pipeline with an ImageSummarizer step and check the extra elements.

        The summarizer receives its PAT from the ``CLARIFAI_PAT`` environment
        variable; ``os.environ.get`` yields ``None`` when the variable is unset.
        """
        import os

        from clarifai_datautils.multimodal import Pipeline
        from clarifai_datautils.multimodal.pipeline.cleaners import Clean_extra_whitespace
        from clarifai_datautils.multimodal.pipeline.PDF import PDFPartitionMultimodal
        from clarifai_datautils.multimodal.pipeline.summarizer import ImageSummarizer

        pipeline = Pipeline(
            name='pipeline-1',
            transformations=[
                PDFPartitionMultimodal(chunking_strategy="by_title", max_characters=1024),
                Clean_extra_whitespace(),
                ImageSummarizer(pat=os.environ.get("CLARIFAI_PAT")),
            ])

        elements = pipeline.run(files=PDF_FILE_PATH, loader=False)

        # Verify the container type before relying on its length and indexing.
        assert isinstance(elements, list)
        assert len(elements) == 16

        # Build the metadata dict once instead of once per assertion.
        first_metadata = elements[0].metadata.to_dict()
        assert first_metadata['filename'] == 'Multimodal_sample_file.pdf'
        assert first_metadata['page_number'] == 1

        assert type(elements[6]).__name__ == 'Table'

        # The third-from-last element is expected to be an original Image that
        # carries an input id.
        image_element = elements[-3]
        assert type(image_element).__name__ == 'Image'
        assert image_element.metadata.is_original is True
        assert image_element.metadata.input_id is not None
        # Named so that it does not shadow the builtin ``id``.
        image_input_id = image_element.metadata.input_id

        # The last element is a non-original CompositeElement that points back
        # to that image (presumably the summary produced by ImageSummarizer --
        # confirm against the summarizer implementation).
        last_element = elements[-1]
        assert type(last_element).__name__ == 'CompositeElement'
        assert last_element.metadata.is_original is False
        assert last_element.metadata.source_input_id == image_input_id