Skip to content

Commit 6f0d475

Browse files
authored
feat(restoreIndices): update restore indices args and docs (#12529)
1 parent 64aaaf1 commit 6f0d475

File tree

3 files changed

+62
-0
lines changed

3 files changed

+62
-0
lines changed

datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/restoreindices/RestoreIndices.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,11 @@ public class RestoreIndices implements Upgrade {
2828
public static final String URN_BASED_PAGINATION_ARG_NAME = "urnBasedPagination";
2929

3030
public static final String STARTING_OFFSET_ARG_NAME = "startingOffset";
31+
public static final String LAST_URN_ARG_NAME = "lastUrn";
32+
public static final String LAST_ASPECT_ARG_NAME = "lastAspect";
33+
public static final String GE_PIT_EPOCH_MS_ARG_NAME = "gePitEpochMs";
34+
public static final String LE_PIT_EPOCH_MS_ARG_NAME = "lePitEpochMs";
35+
public static final String ASPECT_NAMES_ARG_NAME = "aspectNames";
3136

3237
private final List<UpgradeStep> _steps;
3338

datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/restoreindices/SendMAEStep.java

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
import io.ebean.Database;
1515
import io.ebean.ExpressionList;
1616
import java.util.ArrayList;
17+
import java.util.Arrays;
1718
import java.util.List;
1819
import java.util.Map;
1920
import java.util.NoSuchElementException;
@@ -123,6 +124,30 @@ private RestoreIndicesArgs getArgs(UpgradeContext context) {
123124
} else {
124125
context.report().addLine("No urnLike arg present");
125126
}
127+
if (containsKey(context.parsedArgs(), RestoreIndices.LE_PIT_EPOCH_MS_ARG_NAME)) {
128+
result.lePitEpochMs =
129+
Long.parseLong(context.parsedArgs().get(RestoreIndices.LE_PIT_EPOCH_MS_ARG_NAME).get());
130+
context.report().addLine(String.format("lePitEpochMs is %s", result.lePitEpochMs));
131+
}
132+
if (containsKey(context.parsedArgs(), RestoreIndices.GE_PIT_EPOCH_MS_ARG_NAME)) {
133+
result.gePitEpochMs =
134+
Long.parseLong(context.parsedArgs().get(RestoreIndices.GE_PIT_EPOCH_MS_ARG_NAME).get());
135+
context.report().addLine(String.format("gePitEpochMs is %s", result.gePitEpochMs));
136+
}
137+
if (containsKey(context.parsedArgs(), RestoreIndices.LAST_URN_ARG_NAME)) {
138+
result.lastUrn = context.parsedArgs().get(RestoreIndices.LAST_URN_ARG_NAME).get();
139+
context.report().addLine(String.format("lastUrn is %s", result.lastUrn));
140+
}
141+
if (containsKey(context.parsedArgs(), RestoreIndices.LAST_ASPECT_ARG_NAME)) {
142+
result.lastAspect = context.parsedArgs().get(RestoreIndices.LAST_ASPECT_ARG_NAME).get();
143+
context.report().addLine(String.format("lastAspect is %s", result.lastAspect));
144+
}
145+
if (containsKey(context.parsedArgs(), RestoreIndices.ASPECT_NAMES_ARG_NAME)) {
146+
result.aspectNames =
147+
Arrays.asList(
148+
context.parsedArgs().get(RestoreIndices.ASPECT_NAMES_ARG_NAME).get().split(","));
149+
context.report().addLine(String.format("aspectNames is %s", result.aspectNames));
150+
}
126151
return result;
127152
}
128153

docs/how/restore-indices.md

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,38 @@ By default, restoring the indices from the local database will not remove any ex
1111
the search and graph indices that no longer exist in the local database, potentially leading to inconsistencies
1212
between the search and graph indices and the local database.
1313

14+
## Configuration
15+
16+
Upgrade jobs take their job-specific configuration as command-line arguments to the job itself rather than as environment variables. The RestoreIndices job is selected with the `-u RestoreIndices` upgrade ID parameter, and additional parameters are passed with `-a`, for example `-a batchSize=1000`.
17+
The following configurations are available:
18+
19+
### Time-Based Filtering
20+
21+
* `lePitEpochMs`: Restore records created at or before this timestamp (epoch time in milliseconds)
22+
* `gePitEpochMs`: Restore records created at or after this timestamp (epoch time in milliseconds)
23+
24+
### Pagination and Performance Options
25+
26+
* `urnBasedPagination`: Enable key-based pagination instead of offset-based pagination. Recommended for large datasets as it's typically more efficient.
27+
* `startingOffset`: When using default pagination, start from this offset
28+
* `lastUrn`: Resume from a specific URN when using URN-based pagination
29+
* `lastAspect`: Used with lastUrn to resume from a specific aspect, preventing reprocessing
30+
* `numThreads`: Number of concurrent threads used to process the restoration. Only used with the default offset-based pagination.
31+
* `batchSize`: Configures the size of each batch as the job pages through rows
32+
* `batchDelayMs`: Adds a delay in between each batch to avoid overloading backend systems
33+
34+
### Content Filtering
35+
36+
* `aspectNames`: Comma-separated list of aspects to restore (e.g., "ownership,status")
37+
* `urnLike`: SQL LIKE pattern to filter URNs (e.g., "urn:li:dataset%")
38+
39+
### Nuclear option
40+
* `clean`: This option wipes out the current indices by running deletes of all the documents to guarantee a consistent state with SQL. This is generally not recommended unless there is significant data corruption on the instance.
41+
42+
### Helm
43+
44+
For Kubernetes deployments, these options are available in the Helm charts under the `datahubUpgrade.restoreIndices.args` path, which passes them as arguments to the pod command.
45+
1446
## Quickstart
1547

1648
If you're using the quickstart images, you can use the `datahub` cli to restore the indices.

0 commit comments

Comments
 (0)