# run.py
# Example script demonstrating how to use CleanVision's Imagelab.
# Usage: python run.py --path <path to dataset>
import argparse
from cleanvision import Imagelab
def _example_default_workflow(dataset_path: str) -> None:
    """Example 1: default CleanVision workflow.

    Demonstrates the default CleanVision workflow to detect various types of
    issues in the image dataset located at ``dataset_path``.
    """
    imagelab = Imagelab(data_path=dataset_path)  # initialize imagelab
    # NOTE(review): the return value is discarded; presumably this call
    # prints the default checks itself -- confirm against the installed version.
    imagelab.list_default_issue_types()  # list default checks
    imagelab.visualize()  # visualize sample images from the dataset
    imagelab.find_issues()  # find issues in the dataset
    imagelab.report()
    print("Summary of all issues checks\n", imagelab.issue_summary.to_markdown())

    # Visualize images that have specific issues.
    imagelab.visualize(issue_types=["blurry"], num_images=8)

    # Get all images with the blurry issue type and visualize the first few.
    blurry_images = imagelab.issues.query("is_blurry_issue")
    imagelab.visualize(image_files=blurry_images.index.tolist()[:4])

    # Miscellaneous extra information about the dataset and its issues.
    print(list(imagelab.info.keys()), "\n")
    print(list(imagelab.info["statistics"].keys()))
    print(imagelab.info["statistics"]["brightness"][:10])


def _example_incremental_checks(dataset_path: str) -> None:
    """Example 2: targeted and incremental issue checks.

    Demonstrates using CleanVision to:
    1. Check data for specific types of issues
    2. Incrementally detect additional types of issues with an existing Imagelab
    3. Specify a non-default parameter for a particular issue type
       (e.g. a different threshold)
    4. Save and load an Imagelab instance to/from file
    5. Report only specific issue types
    """
    imagelab = Imagelab(data_path=dataset_path)
    issue_types = {"near_duplicates": {}}
    imagelab.find_issues(issue_types)
    imagelab.report()

    # Optional, just included to show how to save/load this as a file.
    imagelab.save("./results")

    # Check for additional types of issues using the existing Imagelab.
    imagelab = Imagelab.load("./results", dataset_path)
    issue_types = {"light": {}, "low_information": {}}
    imagelab.find_issues(issue_types)
    imagelab.report()

    # Check for an issue with a different threshold.
    issue_types = {"dark": {"threshold": 0.2}}
    imagelab.find_issues(issue_types)
    # Report only specific issues; pass a real list (not a dict view) so the
    # argument has the same type as the other report() calls in this script.
    imagelab.report(issue_types=list(issue_types))


def _example_custom_report(dataset_path: str) -> None:
    """Example 3: parameter overrides and report/visualize customization.

    Demonstrates using CleanVision to:
    1. Check for all default issue types, overriding some parameters for a
       particular issue type from their default values
    2. Change the verbosity of the generated report to see more details
    3. Ignore issues occurring in more than x% of images in the dataset
    4. Increase the size of images in the grid displayed by visualize
    """
    imagelab = Imagelab(data_path=dataset_path)
    imagelab.find_issues()
    imagelab.report(["near_duplicates"])

    issue_types = {"near_duplicates": {"hash_type": "phash"}}
    imagelab.find_issues(issue_types)
    imagelab.report(issue_types=list(issue_types))

    # Customize report and visualize.
    # Change verbosity.
    imagelab.report(verbosity=3)

    # Report arg values here will overwrite verbosity defaults.
    # Find top examples suffering from issues that are not present in more
    # than 1% of the dataset.
    imagelab.report(max_prevalence=0.01)

    # Increase cell_size in the grid.
    imagelab.visualize(issue_types=["light"], num_images=8, cell_size=(3, 3))


def _example_hugging_face_dataset() -> None:
    """Example 4: run CleanVision on a Hugging Face dataset."""
    # Imported lazily so the optional dependency is only needed once this
    # example is actually reached.
    from datasets import load_dataset

    hf_dataset = load_dataset("cifar10", split="test")
    imagelab = Imagelab(hf_dataset=hf_dataset, image_key="img")
    imagelab.find_issues()
    imagelab.report()


def _example_torchvision_dataset() -> None:
    """Example 5: run CleanVision on a torchvision dataset."""
    # Imported lazily so the optional dependency is only needed once this
    # example is actually reached.
    from torchvision.datasets import CIFAR10

    torch_dataset = CIFAR10("./", train=False, download=True)
    imagelab = Imagelab(torchvision_dataset=torch_dataset)
    imagelab.find_issues()
    imagelab.report()


def _example_custom_issue(dataset_path: str) -> None:
    """Example 6: detect a custom issue type alongside the default issues.

    Demonstrates creating your own custom issue and using CleanVision to
    detect this additional issue type, along with the default set of issues.
    """
    # Imported before the Imagelab is created, exactly as in the original
    # script order.
    from custom_issue_manager import CustomIssueManager

    imagelab = Imagelab(data_path=dataset_path)
    issue_name = CustomIssueManager.issue_name
    # NOTE(review): return value is discarded; presumably prints -- confirm.
    imagelab.list_possible_issue_types()

    issue_types = {issue_name: {}}
    imagelab.find_issues(issue_types)  # check for custom issue type
    imagelab.find_issues()  # also check for default issue types
    imagelab.report()


def main(argv=None) -> None:
    """Parse the command line and run all six examples in order.

    Args:
        argv: Optional list of command-line arguments (without the program
            name). ``None`` makes argparse read ``sys.argv[1:]``, which is
            the behavior when the file is executed as a script.

    Raises:
        SystemExit: Raised by argparse when the required ``--path`` argument
            is missing or the arguments are otherwise invalid.
    """
    parser = argparse.ArgumentParser(description="Demonstrates how to use Imagelab")
    parser.add_argument("--path", type=str, help="path to dataset", required=True)
    args = parser.parse_args(argv)
    dataset_path = args.path

    _example_default_workflow(dataset_path)
    _example_incremental_checks(dataset_path)
    _example_custom_report(dataset_path)
    _example_hugging_face_dataset()
    _example_torchvision_dataset()
    _example_custom_issue(dataset_path)


if __name__ == "__main__":
    main()