@@ -1,8 +1,7 @@
 import json
-import types
-import unittest.mock
+import re
 from pathlib import Path
-from typing import Any, Dict, Iterable, List, Optional, Union
+from typing import Dict, Iterable, List, Optional, Tuple, Union

 import avro.schema
 import click
@@ -66,29 +65,89 @@ def load_schemas(schemas_path: str) -> Dict[str, dict]:
     return schemas


-def merge_schemas(schemas_obj: List[Any]) -> str:
-    # Combine schemas.
-    merged = ["null"] + schemas_obj
+def patch_schemas(schemas: Dict[str, dict], pdl_path: Path) -> Dict[str, dict]:
+    # We can easily find normal urn types using the generated avro schema,
+    # but for arrays of urns there's nothing in the avro schema and hence
+    # we have to look in the PDL files instead.
+    urn_arrays: Dict[
+        str, List[Tuple[str, str]]
+    ] = {}  # schema name -> list of (field name, type)

-    # Patch add_name method to NOT complain about duplicate names
-    def add_name(self, name_attr, space_attr, new_schema):
-        to_add = avro.schema.Name(name_attr, space_attr, self.default_namespace)
+    # First, we need to load the PDL files and find all urn arrays.
+    for pdl_file in Path(pdl_path).glob("**/*.pdl"):
+        pdl_text = pdl_file.read_text()

-        self.names[to_add.fullname] = new_schema
-        return to_add
+        # TRICKY: We assume that all urn types end with "Urn".
+        arrays = re.findall(
+            r"^\s*(\w+)\s*:\s*(?:optional\s+)?array\[(\w*Urn)\]",
+            pdl_text,
+            re.MULTILINE,
+        )
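+        # e.g. a hypothetical PDL field "owners: optional array[CorpuserUrn]"
+        # would yield ("owners", "CorpuserUrn") here.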
+        if arrays:
+            schema_name = pdl_file.stem
+            urn_arrays[schema_name] = [(item[0], item[1]) for item in arrays]

-    with unittest.mock.patch("avro.schema.Names.add_name", add_name):
-        cleaned_schema = avro.schema.make_avsc_object(merged)
+    # Then, we can patch each schema.
+    patched_schemas = {}
+    for name, schema in schemas.items():
+        patched_schemas[name] = patch_schema(schema, urn_arrays)

-    # Convert back to an Avro schema JSON representation.
-    class MappingProxyEncoder(json.JSONEncoder):
-        def default(self, obj):
-            if isinstance(obj, types.MappingProxyType):
-                return dict(obj)
-            return json.JSONEncoder.default(self, obj)
+    return patched_schemas
+
+
+def patch_schema(schema: dict, urn_arrays: Dict[str, List[Tuple[str, str]]]) -> dict:
+    """
+    This method patches the schema to add an "Urn" property to all urn fields.
+    Because the inner type in an array is not a named Avro schema, for urn arrays
+    we annotate the array field and add an "urn_is_array" property.
+    """
+
+    # We're using Names() to generate a full list of embedded schemas.
+    all_schemas = avro.schema.Names()
+    patched = avro.schema.make_avsc_object(schema, names=all_schemas)
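+    # (make_avsc_object registers every named schema it parses into the Names
+    # instance, so all_schemas now holds each nested record, enum, and fixed type.)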
+
+    for nested in all_schemas.names.values():
+        if isinstance(nested, (avro.schema.EnumSchema, avro.schema.FixedSchema)):
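+            # Enums and fixed types have no fields, so there's nothing to patch.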
+            continue
+        assert isinstance(nested, avro.schema.RecordSchema)
+
+        # Patch normal urn types.
+        field: avro.schema.Field
+        for field in nested.fields:
+            java_class: Optional[str] = field.props.get("java", {}).get("class")
+            if java_class and java_class.startswith(
+                "com.linkedin.pegasus2avro.common.urn."
+            ):
+                field.set_prop("Urn", java_class.split(".")[-1])
+
+        # Patch array urn types.
+        if nested.name in urn_arrays:
+            mapping = urn_arrays[nested.name]
+
+            for field_name, field_type in mapping:
+                field = nested.fields_dict[field_name]
+                field.set_prop("Urn", field_type)
+                field.set_prop("urn_is_array", True)
+
+    return patched.to_json()
+
+
+def merge_schemas(schemas_obj: List[dict]) -> str:
+    # Combine schemas as a "union" of all of the types.
+    merged = ["null"] + schemas_obj
+
+    # Patch add_name method to NOT complain about duplicate names.
+    class NamesWithDups(avro.schema.Names):
+        def add_name(self, name_attr, space_attr, new_schema):
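+            # The stock Names.add_name complains about duplicate fullnames; since
+            # the merged union repeats shared nested types, we overwrite instead.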
+            to_add = avro.schema.Name(name_attr, space_attr, self.default_namespace)
+            self.names[to_add.fullname] = new_schema
+            return to_add
+
+    cleaned_schema = avro.schema.make_avsc_object(merged, names=NamesWithDups())

+    # Convert back to an Avro schema JSON representation.
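+    # (to_json() output now serializes with the stock json encoder, so the old
+    # MappingProxyEncoder workaround is no longer needed.)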
     out_schema = cleaned_schema.to_json()
-    encoded = json.dumps(out_schema, cls=MappingProxyEncoder, indent=2)
+    encoded = json.dumps(out_schema, indent=2)
     return encoded


@@ -149,11 +208,11 @@ def add_avro_python3_warning(filepath: Path) -> None:
 import functools
 import pathlib

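+# Caching at the shared loader means each .avsc file is read from disk at most
+# once, no matter how many of the getXSchema helpers are called.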
+@functools.lru_cache(maxsize=None)
 def _load_schema(schema_name: str) -> str:
     return (pathlib.Path(__file__).parent / f"{schema_name}.avsc").read_text()
 """
 individual_schema_method = """
-@functools.lru_cache(maxsize=None)
 def get{schema_name}Schema() -> str:
     return _load_schema("{schema_name}")
 """
@@ -165,6 +224,17 @@ def make_load_schema_methods(schemas: Iterable[str]) -> str:
     )


+def save_raw_schemas(schema_save_dir: Path, schemas: Dict[str, dict]) -> None:
+    # Save raw avsc files.
+    schema_save_dir.mkdir()
+    for name, schema in schemas.items():
+        (schema_save_dir / f"{name}.avsc").write_text(json.dumps(schema, indent=2))
+
+    # Add getXSchema methods.
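+    # mkdir() above fails if the directory already exists, so "w" (rather than
+    # the old "a") always writes a fresh __init__.py.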
+    with open(schema_save_dir / "__init__.py", "w") as schema_dir_init:
+        schema_dir_init.write(make_load_schema_methods(schemas.keys()))
+
+
 def annotate_aspects(aspects: List[dict], schema_class_file: Path) -> None:
     schema_classes_lines = schema_class_file.read_text().splitlines()
     line_lookup_table = {line: i for i, line in enumerate(schema_classes_lines)}
@@ -177,9 +247,9 @@ def annotate_aspects(aspects: List[dict], schema_class_file: Path) -> None:
     ] += """

 class _Aspect(DictWrapper):
-    ASPECT_NAME: str = None  # type: ignore
-    ASPECT_TYPE: str = "default"
-    ASPECT_INFO: dict = None  # type: ignore
+    ASPECT_NAME: ClassVar[str] = None  # type: ignore
+    ASPECT_TYPE: ClassVar[str] = "default"
+    ASPECT_INFO: ClassVar[dict] = None  # type: ignore
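+    # (ClassVar marks these as class-level constants, so type checkers won't
+    # mistake them for instance attributes.)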

     def __init__(self):
         if type(self) is _Aspect:
@@ -225,16 +295,18 @@ def get_aspect_info(cls) -> dict:
         schema_classes_lines[
             empty_line
         ] += f"\n    ASPECT_TYPE = '{aspect['Aspect']['type']}'"
-        schema_classes_lines[empty_line] += f"\n    ASPECT_INFO = {aspect['Aspect']}"
+
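+        # "name" and "type" are already exposed via ASPECT_NAME and ASPECT_TYPE,
+        # so strip them out of ASPECT_INFO.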
+        aspect_info = {
+            k: v for k, v in aspect["Aspect"].items() if k not in {"name", "type"}
+        }
+        schema_classes_lines[empty_line] += f"\n    ASPECT_INFO = {aspect_info}"

         schema_classes_lines[empty_line + 1] += "\n"

     # Finally, generate a big list of all available aspects.
     newline = "\n"
     schema_classes_lines.append(
         f"""
-from typing import Type
-
 ASPECT_CLASSES: List[Type[_Aspect]] = [
     {f',{newline}    '.join(f"{aspect['name']}Class" for aspect in aspects)}
 ]
@@ -252,14 +324,22 @@ def get_aspect_info(cls) -> dict:
 @click.argument(
     "entity_registry", type=click.Path(exists=True, dir_okay=False), required=True
 )
+@click.argument(
+    "pdl_path", type=click.Path(exists=True, file_okay=False), required=True
+)
 @click.argument(
     "schemas_path", type=click.Path(exists=True, file_okay=False), required=True
 )
 @click.argument("outdir", type=click.Path(), required=True)
-def generate(entity_registry: str, schemas_path: str, outdir: str) -> None:
+def generate(
+    entity_registry: str, pdl_path: str, schemas_path: str, outdir: str
+) -> None:
     entities = load_entity_registry(Path(entity_registry))
     schemas = load_schemas(schemas_path)

+    # Patch the avsc files.
+    schemas = patch_schemas(schemas, Path(pdl_path))
+
     # Special handling for aspects.
     aspects = {
         schema["Aspect"]["name"]: schema
@@ -288,8 +368,8 @@ def generate(entity_registry: str, schemas_path: str, outdir: str) -> None:

     # Check for unused aspects. We currently have quite a few.
     # unused_aspects = set(aspects.keys()) - set().union(
-    #   {entity.keyAspect for entity in entities},
-    #   *(set(entity.aspects) for entity in entities),
+    #     {entity.keyAspect for entity in entities},
+    #     *(set(entity.aspects) for entity in entities),
     # )

     merged_schema = merge_schemas(list(schemas.values()))
@@ -303,17 +383,17 @@ def generate(entity_registry: str, schemas_path: str, outdir: str) -> None:
         Path(outdir) / "schema_classes.py",
     )

-    # Save raw schema files in codegen as well.
+    # Keep a copy of a few raw avsc files.
+    required_avsc_schemas = {"MetadataChangeEvent", "MetadataChangeProposal"}
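+    # Assumption: these top-level schemas are the only ones consumers need in
+    # raw avsc form; everything else is reachable through the generated classes.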
     schema_save_dir = Path(outdir) / "schemas"
-    schema_save_dir.mkdir()
-    for schema_out_file, schema in schemas.items():
-        (schema_save_dir / f"{schema_out_file}.avsc").write_text(
-            json.dumps(schema, indent=2)
-        )
-
-    # Add load_schema method.
-    with open(schema_save_dir / "__init__.py", "a") as schema_dir_init:
-        schema_dir_init.write(make_load_schema_methods(schemas.keys()))
+    save_raw_schemas(
+        schema_save_dir,
+        {
+            name: schema
+            for name, schema in schemas.items()
+            if name in required_avsc_schemas
+        },
+    )

     # Add headers for all generated files
     generated_files = Path(outdir).glob("**/*.py")