Skip to content

Commit 7793ab8

Browse files
Improve performance find zip archive (#1664)
Co-authored-by: Martin Durant <[email protected]>
1 parent ee98ae3 commit 7793ab8

File tree

2 files changed

+382
-0
lines changed

2 files changed

+382
-0
lines changed

fsspec/implementations/tests/test_zip.py

+340
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,13 @@
11
import collections.abc
22
import os.path
3+
from pathlib import Path
4+
from shutil import make_archive
35

46
import pytest
57

68
import fsspec
79
from fsspec.implementations.tests.test_archive import archive_data, tempzip
10+
from fsspec.implementations.zip import ZipFileSystem
811

912

1013
def test_info():
@@ -132,3 +135,340 @@ def test_append(m, tmpdir):
132135
fs.close()
133136

134137
assert len(fsspec.open_files("zip://*::memory://out.zip")) == 2
138+
139+
140+
@pytest.fixture(name="zip_file")
def zip_file_fixture(tmp_path):
    """Zip a small directory tree and return the archive path.

    The tree created under ``tmp_path / "data"`` is::

        file1.txt
        file2.txt
        dir1/                      (empty directory)
        dir2/file3.txt
        dir2startwithsamename.txt  (shares the "dir2" prefix on purpose)

    Returns the ``Path`` of the archive as reported by ``make_archive``.
    """
    root = tmp_path / "data/"
    root.mkdir()

    # Top-level files.
    (root / "file1.txt").write_text("Hello, World!")
    (root / "file2.txt").write_text("Lorem ipsum dolor sit amet")

    # A directory without any content.
    (root / "dir1").mkdir()

    # A directory holding a single file.
    populated_dir = root / "dir2"
    populated_dir.mkdir()
    (populated_dir / "file3.txt").write_text("Hello!")

    # A file whose name starts like the "dir2" directory, to catch
    # prefix-based mix-ups when matching paths.
    (root / "dir2startwithsamename.txt").write_text("Hello again!")

    archive_base = tmp_path / "test"
    archive_name = make_archive(archive_base, "zip", root)
    return Path(archive_name)
162+
163+
164+
def _assert_all_except_context_dependent_variables(result, expected_result):
165+
for path in expected_result.keys():
166+
assert result[path]
167+
result_without_date_time = result[path].copy()
168+
result_without_date_time.pop("date_time")
169+
result_without_date_time.pop("_raw_time")
170+
result_without_date_time.pop("external_attr")
171+
result_without_date_time.pop("create_system")
172+
173+
expected_result_without_date_time = expected_result[path].copy()
174+
expected_result_without_date_time.pop("date_time")
175+
expected_result_without_date_time.pop("_raw_time")
176+
expected_result_without_date_time.pop("external_attr")
177+
expected_result_without_date_time.pop("create_system")
178+
assert result_without_date_time == expected_result_without_date_time
179+
180+
181+
def test_find_returns_expected_result_detail_true(zip_file):
    """``find("/", detail=True)`` returns an info dict for every file.

    The set of returned paths is checked explicitly, because the detailed
    comparison helper only walks the paths listed in ``expected_result`` and
    would not notice a missing or extra entry. The detailed info of
    ``dir2startwithsamename.txt`` is not pinned here; only its presence is.
    """
    zip_file_system = ZipFileSystem(zip_file)

    result = zip_file_system.find("/", detail=True)

    expected_result = {
        "dir2/file3.txt": {
            "orig_filename": "dir2/file3.txt",
            "filename": "dir2/file3.txt",
            "date_time": (2024, 8, 16, 10, 46, 18),
            "compress_type": 8,
            "_compresslevel": None,
            "comment": b"",
            "extra": b"",
            "create_system": 3,
            "create_version": 20,
            "extract_version": 20,
            "reserved": 0,
            "flag_bits": 0,
            "volume": 0,
            "internal_attr": 0,
            "external_attr": 2175008768,
            "header_offset": 260,
            "CRC": 2636827734,
            "compress_size": 8,
            "file_size": 6,
            "_raw_time": 21961,
            "_end_offset": 312,
            "name": "dir2/file3.txt",
            "size": 6,
            "type": "file",
        },
        "file1.txt": {
            "orig_filename": "file1.txt",
            "filename": "file1.txt",
            "date_time": (2024, 8, 16, 10, 46, 18),
            "compress_type": 8,
            "_compresslevel": None,
            "comment": b"",
            "extra": b"",
            "create_system": 3,
            "create_version": 20,
            "extract_version": 20,
            "reserved": 0,
            "flag_bits": 0,
            "volume": 0,
            "internal_attr": 0,
            "external_attr": 2175008768,
            "header_offset": 139,
            "CRC": 3964322768,
            "compress_size": 15,
            "file_size": 13,
            "_raw_time": 21961,
            "_end_offset": 193,
            "name": "file1.txt",
            "size": 13,
            "type": "file",
        },
        "file2.txt": {
            "orig_filename": "file2.txt",
            "filename": "file2.txt",
            "date_time": (2024, 8, 16, 10, 46, 18),
            "compress_type": 8,
            "_compresslevel": None,
            "comment": b"",
            "extra": b"",
            "create_system": 3,
            "create_version": 20,
            "extract_version": 20,
            "reserved": 0,
            "flag_bits": 0,
            "volume": 0,
            "internal_attr": 0,
            "external_attr": 2175008768,
            "header_offset": 193,
            "CRC": 1596576865,
            "compress_size": 28,
            "file_size": 26,
            "_raw_time": 21961,
            "_end_offset": 260,
            "name": "file2.txt",
            "size": 26,
            "type": "file",
        },
    }

    # Same paths as the detail=False variant of this test: every file,
    # including the one that merely shares the "dir2" name prefix, and no
    # directory entries.
    assert sorted(result) == [
        "dir2/file3.txt",
        "dir2startwithsamename.txt",
        "file1.txt",
        "file2.txt",
    ]

    _assert_all_except_context_dependent_variables(result, expected_result)
268+
269+
270+
def test_find_returns_expected_result_detail_false(zip_file):
    """``find("/", detail=False)`` lists every file path, without directories."""
    fs = ZipFileSystem(zip_file)

    found_paths = fs.find("/", detail=False)

    assert found_paths == [
        "dir2/file3.txt",
        "dir2startwithsamename.txt",
        "file1.txt",
        "file2.txt",
    ]
282+
283+
284+
def test_find_returns_expected_result_detail_true_include_dirs(zip_file):
    """``find("/", detail=True, withdirs=True)`` also reports directories.

    The set of returned paths is checked explicitly, because the detailed
    comparison helper only walks the paths listed in ``expected_result`` and
    would not notice a missing or extra entry. The detailed info of
    ``dir2startwithsamename.txt`` is not pinned here; only its presence is.
    """
    zip_file_system = ZipFileSystem(zip_file)

    result = zip_file_system.find("/", detail=True, withdirs=True)
    expected_result = {
        "dir1": {
            "orig_filename": "dir1/",
            "filename": "dir1/",
            "date_time": (2024, 8, 16, 10, 54, 24),
            "compress_type": 0,
            "_compresslevel": None,
            "comment": b"",
            "extra": b"",
            "create_system": 3,
            "create_version": 20,
            "extract_version": 20,
            "reserved": 0,
            "flag_bits": 0,
            "volume": 0,
            "internal_attr": 0,
            "external_attr": 1106051088,
            "header_offset": 0,
            "CRC": 0,
            "compress_size": 0,
            "file_size": 0,
            "_raw_time": 22220,
            "_end_offset": 35,
            "name": "dir1",
            "size": 0,
            "type": "directory",
        },
        "dir2": {
            "orig_filename": "dir2/",
            "filename": "dir2/",
            "date_time": (2024, 8, 16, 10, 54, 24),
            "compress_type": 0,
            "_compresslevel": None,
            "comment": b"",
            "extra": b"",
            "create_system": 3,
            "create_version": 20,
            "extract_version": 20,
            "reserved": 0,
            "flag_bits": 0,
            "volume": 0,
            "internal_attr": 0,
            "external_attr": 1106051088,
            "header_offset": 35,
            "CRC": 0,
            "compress_size": 0,
            "file_size": 0,
            "_raw_time": 22220,
            "_end_offset": 70,
            "name": "dir2",
            "size": 0,
            "type": "directory",
        },
        "dir2/file3.txt": {
            "orig_filename": "dir2/file3.txt",
            "filename": "dir2/file3.txt",
            "date_time": (2024, 8, 16, 10, 54, 24),
            "compress_type": 8,
            "_compresslevel": None,
            "comment": b"",
            "extra": b"",
            "create_system": 3,
            "create_version": 20,
            "extract_version": 20,
            "reserved": 0,
            "flag_bits": 0,
            "volume": 0,
            "internal_attr": 0,
            "external_attr": 2175008768,
            "header_offset": 260,
            "CRC": 2636827734,
            "compress_size": 8,
            "file_size": 6,
            "_raw_time": 22220,
            "_end_offset": 312,
            "name": "dir2/file3.txt",
            "size": 6,
            "type": "file",
        },
        "file1.txt": {
            "orig_filename": "file1.txt",
            "filename": "file1.txt",
            "date_time": (2024, 8, 16, 10, 54, 24),
            "compress_type": 8,
            "_compresslevel": None,
            "comment": b"",
            "extra": b"",
            "create_system": 3,
            "create_version": 20,
            "extract_version": 20,
            "reserved": 0,
            "flag_bits": 0,
            "volume": 0,
            "internal_attr": 0,
            "external_attr": 2175008768,
            "header_offset": 139,
            "CRC": 3964322768,
            "compress_size": 15,
            "file_size": 13,
            "_raw_time": 22220,
            "_end_offset": 193,
            "name": "file1.txt",
            "size": 13,
            "type": "file",
        },
        "file2.txt": {
            "orig_filename": "file2.txt",
            "filename": "file2.txt",
            "date_time": (2024, 8, 16, 10, 54, 24),
            "compress_type": 8,
            "_compresslevel": None,
            "comment": b"",
            "extra": b"",
            "create_system": 3,
            "create_version": 20,
            "extract_version": 20,
            "reserved": 0,
            "flag_bits": 0,
            "volume": 0,
            "internal_attr": 0,
            "external_attr": 2175008768,
            "header_offset": 193,
            "CRC": 1596576865,
            "compress_size": 28,
            "file_size": 26,
            "_raw_time": 22220,
            "_end_offset": 260,
            "name": "file2.txt",
            "size": 26,
            "type": "file",
        },
    }

    # Same paths as the detail=False variant of this test: both directories
    # plus every file, including the one that merely shares the "dir2" prefix.
    assert sorted(result) == [
        "dir1",
        "dir2",
        "dir2/file3.txt",
        "dir2startwithsamename.txt",
        "file1.txt",
        "file2.txt",
    ]

    _assert_all_except_context_dependent_variables(result, expected_result)
422+
423+
424+
def test_find_returns_expected_result_detail_false_include_dirs(zip_file):
    """``find("/", withdirs=True)`` lists directories alongside the files."""
    fs = ZipFileSystem(zip_file)

    found_paths = fs.find("/", detail=False, withdirs=True)

    assert found_paths == [
        "dir1",
        "dir2",
        "dir2/file3.txt",
        "dir2startwithsamename.txt",
        "file1.txt",
        "file2.txt",
    ]
438+
439+
440+
def test_find_returns_expected_result_path_set(zip_file):
    """Searching below ``/dir2`` yields only the file inside that directory.

    In particular ``dir2startwithsamename.txt`` must not be matched.
    """
    fs = ZipFileSystem(zip_file)

    found_paths = fs.find("/dir2")

    assert found_paths == ["dir2/file3.txt"]
447+
448+
449+
def test_find_with_and_without_slash_should_return_same_result(zip_file):
    """A trailing slash on the search path does not change what ``find`` returns."""
    fs = ZipFileSystem(zip_file)

    with_trailing_slash = fs.find("/dir2/")
    without_trailing_slash = fs.find("/dir2")

    assert with_trailing_slash == without_trailing_slash
453+
454+
455+
def test_find_should_return_file_if_exact_match(zip_file):
    """Passing the exact path of a file to ``find`` returns just that file."""
    fs = ZipFileSystem(zip_file)

    found_paths = fs.find("/dir2startwithsamename.txt", detail=False)

    assert found_paths == ["dir2startwithsamename.txt"]
462+
463+
464+
def test_find_returns_expected_result_recursion_depth_set(zip_file):
    """With ``maxdepth=1`` only the top-level files of the archive are listed."""
    fs = ZipFileSystem(zip_file)

    found_paths = fs.find("/", maxdepth=1)

    assert found_paths == [
        "dir2startwithsamename.txt",
        "file1.txt",
        "file2.txt",
    ]

0 commit comments

Comments
 (0)