@@ -160,22 +160,44 @@ def _read_tfds(tfds_builder: tfds.core.DatasetBuilder,
   """Reads a dataset from tfds."""
   # No op if exist.
   tfds_builder.download_and_prepare()
-
-  read_config = tfds.ReadConfig(
-      interleave_cycle_length=cycle_length,
-      interleave_block_length=block_length,
-      input_context=input_context,
-      shuffle_seed=seed)
   decoders = {}
   if tfds_skip_decoding_feature:
     for skip_feature in tfds_skip_decoding_feature.split(','):
       decoders[skip_feature.strip()] = tfds.decode.SkipDecoding()
-  dataset = tfds_builder.as_dataset(
-      split=tfds_split,
-      shuffle_files=is_training,
-      as_supervised=tfds_as_supervised,
-      decoders=decoders,
-      read_config=read_config)
+  if tfds_builder.info.splits:
+    num_shards = len(tfds_builder.info.splits[tfds_split].file_instructions)
+  else:
+    # The tfds mock path often does not provide splits.
+    num_shards = 1
+  if input_context and num_shards < input_context.num_input_pipelines:
+    # The number of files in the dataset split is smaller than the number
+    # of input pipelines. Read the entire dataset first, then shard it in
+    # host memory.
+    read_config = tfds.ReadConfig(
+        interleave_cycle_length=cycle_length,
+        interleave_block_length=block_length,
+        input_context=None,
+        shuffle_seed=seed)
+    dataset = tfds_builder.as_dataset(
+        split=tfds_split,
+        shuffle_files=is_training,
+        as_supervised=tfds_as_supervised,
+        decoders=decoders,
+        read_config=read_config)
+    dataset = dataset.shard(input_context.num_input_pipelines,
+                            input_context.input_pipeline_id)
+  else:
+    read_config = tfds.ReadConfig(
+        interleave_cycle_length=cycle_length,
+        interleave_block_length=block_length,
+        input_context=input_context,
+        shuffle_seed=seed)
+    dataset = tfds_builder.as_dataset(
+        split=tfds_split,
+        shuffle_files=is_training,
+        as_supervised=tfds_as_supervised,
+        decoders=decoders,
+        read_config=read_config)
 
   if is_training and not cache:
     dataset = dataset.repeat()
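
For context on when the new branch fires, here is a minimal, runnable sketch of the condition (the shard count and pipeline count below are hypothetical values for illustration, not taken from the patch):

```python
import tensorflow as tf

# Hypothetical setup: four input pipelines reading a split that has only
# three files, so tfds cannot give every pipeline its own subset of files.
input_context = tf.distribute.InputContext(
    num_input_pipelines=4, input_pipeline_id=0)
num_shards = 3  # stands in for len(...file_instructions) in the patch

# Mirrors the patch's branch condition.
if input_context and num_shards < input_context.num_input_pipelines:
  print('Fall back: read everything, then shard elements in host memory.')
else:
  print('Let tfds shard by file via ReadConfig(input_context=...).')
```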
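And a self-contained sketch of the fallback itself, with tf.data.Dataset.range standing in for the fully-read tfds dataset (the ten elements and two pipelines are illustrative assumptions):

```python
import tensorflow as tf

num_input_pipelines = 2  # hypothetical pipeline count
full_dataset = tf.data.Dataset.range(10)  # stand-in for the tfds dataset

# Dataset.shard(n, i) keeps every n-th element starting at index i, so each
# pipeline ends up with a disjoint slice of the same fully-read dataset.
for pipeline_id in range(num_input_pipelines):
  shard = full_dataset.shard(num_input_pipelines, pipeline_id)
  print(pipeline_id, list(shard.as_numpy_iterator()))
# pipeline 0 -> [0, 2, 4, 6, 8]; pipeline 1 -> [1, 3, 5, 7, 9]
```

The trade-off is that every host now reads the whole split, which costs extra I/O, but each pipeline still receives a disjoint, evenly sized share; that is why the patch only takes this path when there are fewer files than input pipelines and file-level sharding cannot work.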