Skip to content

Commit de5c7ea

Browse files
anakin87 and sjrl authored
feat: add py.typed; adjust Component protocol (#9329)
* experimenting with py.typed
* try changing run method in protocol
* Trigger Build
* better docstring + release note
* remove type:ignore where possible
* Removed a few more type: ignores

---------

Co-authored-by: Sebastian Husch Lee <[email protected]>
1 parent 4ce6934 commit de5c7ea

File tree

6 files changed

+53
-31
lines changed

6 files changed

+53
-31
lines changed

e2e/pipelines/test_evaluation_pipeline.py

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -36,14 +36,14 @@ def indexing_pipeline(documents: List[Document]):
3636
doc_writer = DocumentWriter(document_store=document_store, policy=DuplicatePolicy.SKIP)
3737
doc_embedder = SentenceTransformersDocumentEmbedder(model=EMBEDDINGS_MODEL, progress_bar=False)
3838
ingestion_pipe = Pipeline()
39-
ingestion_pipe.add_component(instance=doc_embedder, name="doc_embedder") # type: ignore
40-
ingestion_pipe.add_component(instance=doc_writer, name="doc_writer") # type: ignore
39+
ingestion_pipe.add_component(instance=doc_embedder, name="doc_embedder")
40+
ingestion_pipe.add_component(instance=doc_writer, name="doc_writer")
4141
ingestion_pipe.connect("doc_embedder.documents", "doc_writer.documents")
4242
ingestion_pipe.run({"doc_embedder": {"documents": documents}})
4343
return document_store
4444

4545

46-
def rag_pipeline(document_store: InMemoryDocumentStore, top_k: int): # type: ignore
46+
def rag_pipeline(document_store: InMemoryDocumentStore, top_k: int):
4747
"""RAG pipeline"""
4848
template = [
4949
ChatMessage.from_system(
@@ -59,11 +59,11 @@ def rag_pipeline(document_store: InMemoryDocumentStore, top_k: int): # type: ig
5959
),
6060
]
6161
rag = Pipeline()
62-
rag.add_component("embedder", SentenceTransformersTextEmbedder(model=EMBEDDINGS_MODEL, progress_bar=False)) # type: ignore
63-
rag.add_component("retriever", InMemoryEmbeddingRetriever(document_store, top_k=top_k)) # type: ignore
64-
rag.add_component("prompt_builder", ChatPromptBuilder(template=template)) # type: ignore
65-
rag.add_component("generator", OpenAIChatGenerator(model="gpt-4o-mini")) # type: ignore
66-
rag.add_component("answer_builder", AnswerBuilder()) # type: ignore
62+
rag.add_component("embedder", SentenceTransformersTextEmbedder(model=EMBEDDINGS_MODEL, progress_bar=False))
63+
rag.add_component("retriever", InMemoryEmbeddingRetriever(document_store, top_k=top_k))
64+
rag.add_component("prompt_builder", ChatPromptBuilder(template=template))
65+
rag.add_component("generator", OpenAIChatGenerator(model="gpt-4o-mini"))
66+
rag.add_component("answer_builder", AnswerBuilder())
6767
rag.connect("embedder", "retriever.query_embedding")
6868
rag.connect("retriever", "prompt_builder.documents")
6969
rag.connect("prompt_builder", "generator")

haystack/components/converters/multi_file_converter.py

Lines changed: 11 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -80,24 +80,22 @@ def __init__(self, encoding: str = "utf-8", json_content_key: str = "content") -
8080
# Create pipeline and add components
8181
pp = Pipeline()
8282

83-
# We use type ignore here to avoid type checking errors
84-
# This is due to how the run method within the Component protocol is defined
85-
pp.add_component("router", router) # type: ignore[arg-type]
86-
pp.add_component("docx", DOCXToDocument(link_format="markdown")) # type: ignore[arg-type]
83+
pp.add_component("router", router)
84+
pp.add_component("docx", DOCXToDocument(link_format="markdown"))
8785
pp.add_component(
8886
"html",
89-
HTMLToDocument( # type: ignore[arg-type]
87+
HTMLToDocument(
9088
extraction_kwargs={"output_format": "markdown", "include_tables": True, "include_links": True}
9189
),
9290
)
93-
pp.add_component("json", JSONConverter(content_key=self.json_content_key)) # type: ignore[arg-type]
94-
pp.add_component("md", TextFileToDocument(encoding=self.encoding)) # type: ignore[arg-type]
95-
pp.add_component("text", TextFileToDocument(encoding=self.encoding)) # type: ignore[arg-type]
96-
pp.add_component("pdf", PyPDFToDocument()) # type: ignore[arg-type]
97-
pp.add_component("pptx", PPTXToDocument()) # type: ignore[arg-type]
98-
pp.add_component("xlsx", XLSXToDocument()) # type: ignore[arg-type]
99-
pp.add_component("joiner", DocumentJoiner()) # type: ignore[arg-type]
100-
pp.add_component("csv", CSVToDocument(encoding=self.encoding)) # type: ignore[arg-type]
91+
pp.add_component("json", JSONConverter(content_key=self.json_content_key))
92+
pp.add_component("md", TextFileToDocument(encoding=self.encoding))
93+
pp.add_component("text", TextFileToDocument(encoding=self.encoding))
94+
pp.add_component("pdf", PyPDFToDocument())
95+
pp.add_component("pptx", PPTXToDocument())
96+
pp.add_component("xlsx", XLSXToDocument())
97+
pp.add_component("joiner", DocumentJoiner())
98+
pp.add_component("csv", CSVToDocument(encoding=self.encoding))
10199

102100
for mime_type in ConverterMimeType:
103101
pp.connect(f"router.{mime_type.value}", str(mime_type).lower().rsplit(".", maxsplit=1)[-1])

haystack/components/preprocessors/document_preprocessor.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -127,10 +127,8 @@ def __init__( # noqa: PLR0913 (too-many-arguments)
127127
# Build the Pipeline
128128
pp = Pipeline()
129129

130-
# We use type ignore here to avoid type checking errors
131-
# This is due to how the run method within the Component protocol is defined
132-
pp.add_component("splitter", splitter) # type: ignore[arg-type]
133-
pp.add_component("cleaner", cleaner) # type: ignore[arg-type]
130+
pp.add_component("splitter", splitter)
131+
pp.add_component("cleaner", cleaner)
134132

135133
# Connect the splitter output to cleaner
136134
pp.connect("splitter.documents", "cleaner.documents")

haystack/core/component/component.py

Lines changed: 23 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -160,12 +160,29 @@ def run(self, **kwargs):
160160
isinstance(MyComponent, Component)
161161
"""
162162

163-
# This is the most reliable way to define the protocol for the `run` method.
164-
# Defining a method doesn't work as different Components will have different
165-
# arguments. Even defining here a method with `**kwargs` doesn't work as the
166-
# expected signature must be identical.
167-
# This makes most Language Servers and type checkers happy and shows less errors.
168-
run: Callable[..., Dict[str, Any]]
163+
# The following expression defines a run method compatible with any input signature.
164+
# Its type is equivalent to Callable[..., Dict[str, Any]].
165+
# See https://typing.python.org/en/latest/spec/callables.html#meaning-of-in-callable.
166+
#
167+
# Using `run: Callable[..., Dict[str, Any]]` directly leads to type errors: the protocol would expect a settable
168+
# attribute `run`, while the actual implementation is a read-only method.
169+
# For example:
170+
# from haystack import Pipeline, component
171+
# @component
172+
# class MyComponent:
173+
# @component.output_types(out=str)
174+
# def run(self):
175+
# return {"out": "Hello, world!"}
176+
# pipeline = Pipeline()
177+
# pipeline.add_component("my_component", MyComponent())
178+
#
179+
# mypy raises:
180+
# error: Argument 2 to "add_component" of "PipelineBase" has incompatible type "MyComponent"; expected "Component"
181+
# [arg-type]
182+
# note: Protocol member Component.run expected settable variable, got read-only attribute
183+
184+
def run(self, *args: Any, **kwargs: Any) -> Dict[str, Any]: # pylint: disable=missing-function-docstring # noqa: D102
185+
...
169186

170187

171188
class ComponentMeta(type):

haystack/py.typed

Whitespace-only changes.
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
---
2+
upgrade:
3+
- |
4+
We've added a `py.typed` file to Haystack to enable type information to be used by downstream projects, in line
5+
with PEP 561. This means Haystack's type hints will now be visible to type checkers in projects that depend on it.
6+
Haystack is primarily type checked using mypy (not pyright) and, despite our efforts, some type information can
7+
be incomplete or unreliable.
8+
If you use static type checking in your own project, you may notice some changes: previously, Haystack's types were
9+
effectively treated as `Any`, but now actual type information will be available and enforced.

0 commit comments

Comments
 (0)