-
Notifications
You must be signed in to change notification settings - Fork 54
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
fix: _get_folder_size fn #471
base: main
Are you sure you want to change the base?
Changes from all commits
09e4978
712b2a7
0c5c849
cc95737
c457e74
2a764c6
012be9f
08e0015
fef180b
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -182,7 +182,10 @@ def _maybe_delete_chunks(self) -> None: | |
def _can_delete_chunk(self) -> bool: | ||
if self._delete_chunks_when_processed: | ||
return self._pre_download_counter >= self._max_pre_download - 1 | ||
return self._max_cache_size is not None and _get_folder_size(self._parent_cache_dir) >= self._max_cache_size | ||
return ( | ||
self._max_cache_size is not None | ||
and _get_folder_size(self._parent_cache_dir, self._config) >= self._max_cache_size | ||
) | ||
|
||
def _pre_load_chunk(self, chunk_index: int) -> None: | ||
chunk_filepath, _, _ = self._config[ChunkedIndex(index=-1, chunk_index=chunk_index)] | ||
|
@@ -432,17 +435,17 @@ def __del__(self) -> None: | |
self._prepare_thread = None | ||
|
||
|
||
def _get_folder_size(path: str) -> int: | ||
def _get_folder_size(path: str, config: ChunksConfig) -> int: | ||
"""Collect the size of each files within a folder. | ||
|
||
This method is robust to file deletion races | ||
|
||
""" | ||
size = 0 | ||
for dirpath, _, filenames in os.walk(str(path)): | ||
for filename in filenames: | ||
for filename in os.listdir(os.path.join(path, "cache_dir")): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. We shouldn't add `cache_dir` there. |
||
if filename in config.filename_to_size_map: | ||
with contextlib.suppress(FileNotFoundError): | ||
size += os.stat(os.path.join(dirpath, filename)).st_size | ||
size += config.filename_to_size_map[filename] | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Let's check whether the filename is in the map; otherwise, this would fail the thread. Normally, the files should all be there. We need to print a warning if one isn't. |
||
return size | ||
|
||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We shouldn't use the `parent_dir` anymore. Otherwise, this would always be empty. Each StreamingDataset should take care of only its own cache.