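"""Extract features from webpage capture zips and collect duplicate observations.

Walks a capture directory tree, runs the feature extractor on every zip found,
counts structural hashes, and records only observations whose structural hash
has been seen before (duplicates), writing them to CSV and JSON.
"""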
import os
import queue
import csv
import json
import sys
from typing import Any
from feature_extractor.zip_processor import ZipProcessor
from feature_extractor.feature_extractor import FeatureExtractor


def make_observation(path: str, tags: list[str] | None = None) -> dict[str, Any] | None:
    """Extract all features from a single capture zip; return None on failure."""
    processor = ZipProcessor(path)
    processor.set_tags(tags or [])
    extractor = FeatureExtractor(processor)
    try:
        features = extractor.extract_all_features()
    except Exception as e:
        print(f"Error: {e} in {path}")
        features = None
    finally:
        # Clean up the unzipped capture whether or not extraction succeeded.
        processor.delete_extracted_folder()
    return features
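# Note: the returned dict maps feature names to values; the code below relies on
# at least the 'structural_hash' and 'uuid' keys being present.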


def remove_useless_tags(tags: list[str]) -> list[str]:
    """Drop generic path components so only meaningful tags remain."""
    useless_tags = {"captures", "data", "tests", "additional_captures"}
    return list(set(tags) - useless_tags)
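# For example (illustrative input): splitting 'data/captures/parking-page' gives
# ['data', 'captures', 'parking-page'], which reduces to ['parking-page'].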


def main():
    # capture_dir = "tests/captures"
    capture_dir = 'data/captures/additional_captures'
    # capture_dir = 'data/captures/selected_captures'
    # capture_dir = 'data/captures/ovh'
    all_rows = {}
    name = os.path.basename(capture_dir)
    json_file = f'data/output/data_{name}_duplicates.json'
    csv_file = f'data/output/data_{name}_duplicates.csv'
    structural_hash_count = {}
    # Run one known-good capture to verify extraction works and to get the CSV fieldnames.
    test_observation = make_observation("tests/captures/parking-page/dan.zip")
    if test_observation:
        fieldnames = list(test_observation.keys())
    else:
        print("Feature extraction failed on the test capture. Exiting.")
        sys.exit(1)
    folder_queue = queue.Queue()
    folder_queue.put(capture_dir)
    mode = 'w'  # use "'w' if not os.path.exists(csv_file) else 'a'" to append across runs
    with open(csv_file, mode, newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=fieldnames)
        if mode == 'w':
            writer.writeheader()
        # Breadth-first traversal of the folder tree to find all capture zips.
        while not folder_queue.empty():
            capture_dir = folder_queue.get()
            if not os.path.isdir(capture_dir):
                continue
            tags = remove_useless_tags(capture_dir.split("/"))
            print(tags)
            for capture_file in os.listdir(capture_dir):
                if capture_file.endswith('.zip'):
                    try:
                        observation = make_observation(os.path.join(capture_dir, capture_file), tags)
                    except Exception as e:
                        print(e)
                        continue
                    if not observation:
                        continue
                    structural_hash = observation['structural_hash']
                    structural_hash_count[structural_hash] = structural_hash_count.get(structural_hash, 0) + 1
                    if structural_hash_count[structural_hash] > 1:
                        # Repeated structural hash: record the duplicate.
                        writer.writerow(observation)
                        all_rows[observation['uuid']] = observation
                    else:
                        # First occurrence of this hash; open question what to do with these.
                        pass
                else:
                    # Non-zip entries are assumed to be subfolders;
                    # there should not be extracted captures in the tree.
                    folder_queue.put(os.path.join(capture_dir, capture_file))
    with open(json_file, 'w', encoding='utf-8') as file:
        json.dump(all_rows, file)
    print(f"{len(fieldnames)} fields per observation")
    print(f"{len(all_rows)} duplicate observations written")


if __name__ == "__main__":
    main()