 import os
 from typing import Callable, Any, Dict
 
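+# Optional progress display: use tqdm when it is installed; otherwise fall
+# back to a no-op wrapper that returns the iterable unchanged.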
+try:
+    from tqdm import tqdm as progress_bar
+except ImportError:
+
+    def progress_bar(iterable, *args, **kwargs):
+        return iterable
+
+
 import numpy
 
 import deeplake
 from ._deeplake import *
 
-__version__ = "4.0.3"
+__version__ = "4.1.0"
 
 __all__ = [
     "__version__",
@@ ... @@
     "ColumnView",
     "Column",
     "Version",
-    "Prefetcher",
     "DatasetView",
     "Dataset",
     "ReadOnlyDataset",
@@ ... @@
     "ColumnAlreadyExistsError",
     "ColumnDoesNotExistError",
     "InvalidColumnValueError",
+    "InvalidPolygonShapeError",
+    "InvalidLinkDataError",
     "PushError",
     "GcsStorageProviderFailed",
     "History",
@@ ... @@
     "LogNotexistsError",
     "IncorrectDeeplakePathError",
     "AuthenticationError",
+    "BadRequestError",
     "AuthorizationError",
     "NotFoundError",
     "AgreementError",
@@ ... @@
     "InvalidChunkStrategyType",
     "InvalidSequenceOfSequence",
     "InvalidTypeAndFormatPair",
+    "InvalidLinkType",
     "UnknownType",
     "InvalidTextType",
     "UnsupportedPythonType",
     "UnsupportedSampleCompression",
     "UnsupportedChunkCompression",
     "InvalidImageCompression",
-    "InvalidMaskCompression",
+    "InvalidSegmentMaskCompression",
+    "InvalidBinaryMaskCompression",
     "DtypeMismatch",
     "UnspecifiedDtype",
     "DimensionsMismatch",
@@ ... @@
     "StorageInternalError",
     "WriteFailedError",
     "QuantizationType",
+    "InvalidCredsKeyAssignmentError",
+    "CredsKeyAlreadyAssignedError",
     "core",
     "create",
     "create_async",
@@ ... @@
 
 def _tensorflow(self) -> Any:
     from deeplake._tensorflow import _from_dataset
+
     return _from_dataset(self)
 
 
 def _pytorch(self, transform: Callable[[Any], Any] = None):
     from deeplake._torch import TorchDataset
+
     return TorchDataset(self, transform=transform)
 
 
 DatasetView.pytorch = _pytorch
 DatasetView.tensorflow = _tensorflow
 
+
 def load(*args, **kwargs):
     """
     .. deprecated:: 4.0.0
     """
-    raise Exception("""
+    raise Exception(
+        """
 The API for Deep Lake 4.0 has changed significantly, including the `load` method being replaced by `open`.
 To continue using Deep Lake 3.x, use `pip install "deeplake<4"`.
 For information on migrating your code, see https://docs.deeplake.ai/latest/details/v3_conversion/
-    """.replace("\n", " ").strip())
+    """.replace(
+        "\n", " "
+    ).strip()
+    )
+
 
 def empty(*args, **kwargs):
     """
     .. deprecated:: 4.0.0
     """
-    raise Exception("""
+    raise Exception(
+        """
 The API for Deep Lake 4.0 has changed significantly, including the `empty` method being replaced by `create`.
 To continue using Deep Lake 3.x, use `pip install "deeplake<4"`.
 For information on migrating your code, see https://docs.deeplake.ai/latest/details/v3_conversion/
-    """.replace("\n", " ").strip())
+    """.replace(
+        "\n", " "
+    ).strip()
+    )
+
 
 def convert(src: str, dst: str, dst_creds: Dict[str, str] = None):
     """
     Copies the v3 dataset at src into a new dataset in the new v4 format.
     """
 
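+    # Helper that commits with a descriptive message; note dataset.commit()
+    # is currently called without forwarding the message.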
+    def commit_data(dataset, message="Committing data"):
+        dataset.commit()
+
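+    # Non-link image/mask columns are handed to the prefetcher as "raw"
+    # columns (presumably so their encoded bytes are passed through as-is).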
+    def get_raw_columns(source):
+        return [
+            col.name
+            for col in source.schema.columns
+            if not col.dtype.is_link
+            and col.dtype.kind
+            in {
+                deeplake.types.TypeKind.Image,
+                deeplake.types.TypeKind.SegmentMask,
+                deeplake.types.TypeKind.BinaryMask,
+            }
+        ]
+
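+    # Fast path: stream batches from the prefetcher straight into the
+    # destination, committing every 100 batches as a checkpoint.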
+    def transfer_non_link_data(source, dest, batch_size):
+        dl = deeplake._deeplake._Prefetcher(
+            source,
+            batch_size=batch_size,
+            adaptive=True,
+            raw_columns=set(get_raw_columns(source)),
+        )
+        for counter, batch in enumerate(progress_bar(dl), start=1):
+            dest.append(batch)
+            if counter % 100 == 0:
+                commit_data(dest)
+        commit_data(dest, "Final commit of non-link data")
+
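+    # Link columns are excluded from the prefetch query; each batch then has
+    # the matching slice of link data spliced back in by row index.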
+    def transfer_with_links(source, dest, links, column_names, batch_size):
+        iterable_cols = [col for col in column_names if col not in links]
+        link_sample_info = {link: source[link]._links_info() for link in links}
+        dest.set_creds_key(link_sample_info[links[0]]["key"])
+        pref_ds = source.query(f"SELECT {','.join(iterable_cols)}")
+        dl = deeplake._deeplake._Prefetcher(
+            pref_ds,
+            batch_size=batch_size,
+            adaptive=True,
+            raw_columns=set(get_raw_columns(source)),
+        )
+
+        for counter, batch in enumerate(progress_bar(dl), start=1):
+            for link in links:
+                link_data = link_sample_info[link]["data"]
+                start_index = (counter - 1) * batch_size
+                end_index = min(counter * batch_size, len(link_data))
+                batch[link] = link_data[start_index:end_index]
+
+            dest.append(batch)
+            if counter % 100 == 0:
+                commit_data(dest)
+        commit_data(dest, "Final commit of linked data")
+
     source_ds = deeplake.query(f'select * from "{src}"')
     dest_ds = deeplake.like(source_ds, dst, dst_creds)
-    dest_ds.commit("Created dataset")
+    commit_data(dest_ds, "Created dataset")
+
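+    # Partition the schema into link and non-link columns to choose the
+    # appropriate transfer path below.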
+    column_names = [col.name for col in source_ds.schema.columns]
+    links = [
+        col.name
+        for col in source_ds.schema.columns
+        if source_ds.schema[col.name].dtype.is_link
+    ]
+    batch_size = 10000
 
-    dl = deeplake.Prefetcher(source_ds, batch_size=10000)
-    counter = 0
     print(f"Transferring {len(source_ds)} rows to {dst}...")
-    for b in dl:
-        dest_ds.append(b)
-        counter += 1
-        if counter > 0 and counter % 100 == 0:
-            dest_ds.commit()
-    dest_ds.commit()
-    print(f"Transferring data.... to {dst}... DONE")
+    if not links:
+        transfer_non_link_data(source_ds, dest_ds, batch_size)
+    else:
+        transfer_with_links(source_ds, dest_ds, links, column_names, batch_size)
 
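+    # Copy any per-column metadata from the source to the destination.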
+    for column in column_names:
+        meta = dict(source_ds[column].metadata)
+        if meta:
+            for key, value in meta.items():
+                dest_ds[column].metadata[key] = value
+
+    commit_data(dest_ds, "Final commit of metadata")
+    print(f"Data transfer to {dst} complete.")
 
 
 def __register_at_fork():
     from ._deeplake import __prepare_atfork, __parent_atfork, __child_atfork
 
     UNSAFE_TYPES = (
-        Dataset, DatasetView, ReadOnlyDataset, Column, ColumnView, ColumnDefinition, ColumnDefinitionView, Row, RowView,
-        RowRange, RowRangeView, Schema, SchemaView, Version, History, Prefetcher,Tag,Tags)
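+        # Native-handle types that should not be carried across a fork.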
+        Dataset,
+        DatasetView,
+        ReadOnlyDataset,
+        Column,
+        ColumnView,
+        ColumnDefinition,
+        ColumnDefinitionView,
+        Row,
+        RowView,
+        RowRange,
+        RowRangeView,
+        Schema,
+        SchemaView,
+        Version,
+        History,
+        Tag,
+        Tags,
+    )
 
     def check_main_globals_for_unsafe_types():
         import inspect