-
Notifications
You must be signed in to change notification settings - Fork 3k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
refactor(ingest/s3): enhance readability #12609
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -847,7 +847,7 @@ | |||||
path_spec: PathSpec, | ||||||
bucket: "Bucket", | ||||||
prefix: str, | ||||||
) -> List[Folder]: | ||||||
) -> Iterable[Folder]: | ||||||
""" | ||||||
Retrieves all the folders in a path by listing all the files in the prefix. | ||||||
If the prefix is a full path then only that folder will be extracted. | ||||||
|
@@ -877,51 +877,30 @@ | |||||
s3_objects = ( | ||||||
obj | ||||||
for obj in bucket.objects.filter(Prefix=prefix).page_size(PAGE_SIZE) | ||||||
if _is_allowed_path(path_spec, f"s3://{obj.bucket_name}/{obj.key}") | ||||||
if _is_allowed_path( | ||||||
path_spec, self.create_s3_path(obj.bucket_name, obj.key) | ||||||
) | ||||||
) | ||||||
|
||||||
partitions: List[Folder] = [] | ||||||
grouped_s3_objects_by_dirname = groupby_unsorted( | ||||||
s3_objects, | ||||||
key=lambda obj: obj.key.rsplit("/", 1)[0], | ||||||
) | ||||||
for key, group in grouped_s3_objects_by_dirname: | ||||||
file_size = 0 | ||||||
creation_time = None | ||||||
modification_time = None | ||||||
|
||||||
for item in group: | ||||||
file_size += item.size | ||||||
if creation_time is None or item.last_modified < creation_time: | ||||||
creation_time = item.last_modified | ||||||
if modification_time is None or item.last_modified > modification_time: | ||||||
modification_time = item.last_modified | ||||||
max_file = item | ||||||
|
||||||
if modification_time is None: | ||||||
logger.warning( | ||||||
f"Unable to find any files in the folder {key}. Skipping..." | ||||||
) | ||||||
continue | ||||||
Comment on lines 901 to 905

There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Remove this (lines 901–905) because |
||||||
|
||||||
id = path_spec.get_partition_from_path( | ||||||
self.create_s3_path(max_file.bucket_name, max_file.key) | ||||||
for _, group in grouped_s3_objects_by_dirname: | ||||||
max_file = max(group, key=lambda x: x.last_modified) | ||||||
max_file_s3_path = self.create_s3_path(max_file.bucket_name, max_file.key) | ||||||
|
||||||
# If partition_id is None, it means the folder is not a partition | ||||||
partition_id = path_spec.get_partition_from_path(max_file_s3_path) | ||||||
|
||||||
yield Folder( | ||||||
partition_id=partition_id, | ||||||
is_partition=bool(partition_id), | ||||||
creation_time=max_file.last_modified, | ||||||
modification_time=max_file.last_modified, | ||||||
sample_file=max_file_s3_path, | ||||||
size=sum(obj.size for obj in group), | ||||||
) | ||||||
Comment on lines 888 to 902

There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
The performance loss is minimal because the time complexity has only increased from O(n) to O(2n). There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Why O(2n)? Because this calls both max() and sum() over each group. |
||||||
|
||||||
# If id is None, it means the folder is not a partition | ||||||
partitions.append( | ||||||
Folder( | ||||||
partition_id=id, | ||||||
is_partition=bool(id), | ||||||
creation_time=creation_time if creation_time else None, # type: ignore[arg-type] | ||||||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. We don't need null handling. The type of
|
||||||
modification_time=modification_time, | ||||||
sample_file=self.create_s3_path(max_file.bucket_name, max_file.key), | ||||||
size=file_size, | ||||||
) | ||||||
) | ||||||
|
||||||
return partitions | ||||||
|
||||||
def s3_browser(self, path_spec: PathSpec, sample_size: int) -> Iterable[BrowsePath]: | ||||||
if self.source_config.aws_config is None: | ||||||
raise ValueError("aws_config not set. Cannot browse s3") | ||||||
|
@@ -1000,7 +979,7 @@ | |||||
min=True, | ||||||
) | ||||||
dirs_to_process.append(dirs_to_process_min[0]) | ||||||
folders = [] | ||||||
folders: List[Folder] = [] | ||||||
for dir in dirs_to_process: | ||||||
logger.info(f"Getting files from folder: {dir}") | ||||||
prefix_to_process = urlparse(dir).path.lstrip("/") | ||||||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment.
The reason will be displayed to describe this comment to others. Learn more.
ref