Skip to content

Commit 4082a43

Browse files
[vpj] Unify shared temp directories for VPJ (#1025)
Currently, VPJ uses temp directories for at least two features: 1. To store the output from ValidateSchemaAndBuildDict mapper job to pass the dictionary to the driver 2. To store schemas for TTL repush In the future, we might add more such cases where data needs to be stored in a temp directory. For operational reasons, it is desirable to have a temp directory that is shared by all VPJ jobs, and inside this, we can create other feature-specific shared directories that are also shared by all VPJ jobs. These shared directories will have 777 permissions so any user can write to it. If features have private data that need restricted permissions, the feature implementation can create files or subdirectories inside the feature directories and apply the restricted permissions to those. After this commit, the temp directory will be: . |____<hadoop.tmp.dir> (Specified by env, or default /tmp) | |____venice-push-job (777 permissions) - shared by all VPJ | | |____<job.execution.id>_<unique-suffix> (700 permissions) - shared by all features in this execution | | | |____veniceMapperOutput (700 permissions) | | | |____rmd_schemas (700 permissions) | | | |____value_schemas (700 permissions) | | | |____...<features_added_in_the_future> (700 permissions) * Address review comments to fix code comments
1 parent d37e6a9 commit 4082a43

File tree

10 files changed

+180
-149
lines changed

10 files changed

+180
-149
lines changed

clients/venice-push-job/src/main/java/com/linkedin/venice/hadoop/PushJobSetting.java

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,10 @@ public class PushJobSetting implements Serializable {
2525
public String jobId;
2626
public String jobExecutionId;
2727
public String jobServerName;
28+
// Path was not serializable till HDFS version 3.0.0, so we use URI instead:
29+
// https://issues.apache.org/jira/browse/HADOOP-13519
30+
public String sharedTmpDir;
31+
public String jobTmpDir;
2832
public boolean enableSSL;
2933
public Class<? extends VenicePushJob> vpjEntryClass;
3034
public String veniceControllerUrl;
@@ -58,7 +62,6 @@ public class PushJobSetting implements Serializable {
5862
public boolean compressionMetricCollectionEnabled;
5963
/** Refer {@link VenicePushJobConstants#USE_MAPPER_TO_BUILD_DICTIONARY} **/
6064
public boolean useMapperToBuildDict;
61-
public String useMapperToBuildDictOutputPath;
6265
public boolean repushTTLEnabled;
6366
// specify time to drop stale records.
6467
public long repushTTLStartTimeMs;

clients/venice-push-job/src/main/java/com/linkedin/venice/hadoop/ValidateSchemaAndBuildDictMapperOutputReader.java

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -28,18 +28,18 @@ public class ValidateSchemaAndBuildDictMapperOutputReader implements Closeable {
2828
private final InputStream inputStream;
2929
private final DataFileStream avroDataFileStream;
3030
private final FileSystem fs;
31-
private final String outputDir;
31+
private final Path outputDir;
3232

33-
public ValidateSchemaAndBuildDictMapperOutputReader(String outputDir, String fileName) throws Exception {
34-
Validate.notEmpty(
33+
public ValidateSchemaAndBuildDictMapperOutputReader(Path outputDir, String fileName) throws Exception {
34+
Validate.notNull(
3535
outputDir,
36-
ValidateSchemaAndBuildDictMapper.class.getSimpleName() + " output directory should not be empty");
36+
ValidateSchemaAndBuildDictMapper.class.getSimpleName() + " output directory should not be null");
3737
Validate.notEmpty(
3838
fileName,
3939
ValidateSchemaAndBuildDictMapper.class.getSimpleName() + " output fileName should not be empty");
4040

4141
this.outputDir = outputDir;
42-
Path filePath = new Path(String.format("%s/%s", outputDir, fileName));
42+
Path filePath = new Path(outputDir, fileName);
4343

4444
LOGGER.info(
4545
"Reading file {} to retrieve info persisted by {}",
@@ -94,7 +94,7 @@ public void close() {
9494

9595
// delete the output directory: It should not affect other VPJs as this is unique
9696
try {
97-
fs.delete(new Path(outputDir), true);
97+
fs.delete(outputDir, true);
9898
} catch (IOException e) {
9999
LOGGER.error("Failed to delete directory: {}", outputDir, e);
100100
}

clients/venice-push-job/src/main/java/com/linkedin/venice/hadoop/ValidateSchemaAndBuildDictOutputFormat.java

Lines changed: 2 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,13 @@
11
package com.linkedin.venice.hadoop;
22

33
import static com.linkedin.venice.hadoop.VenicePushJob.getValidateSchemaAndBuildDictionaryOutputFileNameNoExtension;
4-
import static com.linkedin.venice.hadoop.VenicePushJobConstants.MAPPER_OUTPUT_DIRECTORY;
54
import static com.linkedin.venice.hadoop.VenicePushJobConstants.VALIDATE_SCHEMA_AND_BUILD_DICT_MAPPER_OUTPUT_DIRECTORY;
65
import static org.apache.hadoop.mapreduce.MRJobConfig.ID;
76

87
import java.io.IOException;
98
import org.apache.avro.mapred.AvroOutputFormat;
109
import org.apache.hadoop.fs.FileSystem;
1110
import org.apache.hadoop.fs.Path;
12-
import org.apache.hadoop.fs.permission.FsPermission;
1311
import org.apache.hadoop.mapred.FileAlreadyExistsException;
1412
import org.apache.hadoop.mapred.FileOutputFormat;
1513
import org.apache.hadoop.mapred.JobConf;
@@ -30,37 +28,6 @@
3028
public class ValidateSchemaAndBuildDictOutputFormat extends AvroOutputFormat {
3129
private static final Logger LOGGER = LogManager.getLogger(ValidateSchemaAndBuildDictOutputFormat.class);
3230

33-
private static void createDirectoryWithPermission(FileSystem fs, Path path, String permission) throws IOException {
34-
createDirectoryWithPermission(fs, path, permission, false);
35-
}
36-
37-
private static void createDirectoryWithPermission(FileSystem fs, Path path, String permission, boolean deleteIfExists)
38-
throws IOException {
39-
LOGGER.info("Trying to create path {} with permission {}", path.getName(), permission);
40-
boolean createPath = false;
41-
// check if the path needs to be created
42-
if (fs.exists(path)) {
43-
if (deleteIfExists) {
44-
LOGGER.info("path {} exists already, but will be deleted and recreated", path);
45-
fs.delete(path, true);
46-
createPath = true;
47-
} else {
48-
LOGGER.info("path {} exists already", path);
49-
}
50-
} else {
51-
createPath = true;
52-
}
53-
54-
// create if needed
55-
if (createPath) {
56-
LOGGER.info("Creating path {} with permission {}", path.getName(), permission);
57-
fs.mkdirs(path);
58-
// mkdirs(path,permission) didn't set the right permission when
59-
// tested in hdfs, so splitting it like this, it works!
60-
fs.setPermission(path, new FsPermission(permission));
61-
}
62-
}
63-
6431
/**
6532
* 1. The parent directory should be accessible by every user/group (777)
6633
* 2. unique sub-directory for this VPJ should be accessible only by
@@ -70,21 +37,9 @@ private static void createDirectoryWithPermission(FileSystem fs, Path path, Stri
7037
* @param job mapred config
7138
* @throws IOException
7239
*/
73-
protected static void setValidateSchemaAndBuildDictionaryOutputDirPath(JobConf job) throws IOException {
74-
// parent directory: Common directory under which all the different push jobs
75-
// create their job specific directories.
76-
String parentOutputDir = job.get(MAPPER_OUTPUT_DIRECTORY);
77-
Path outputPath = new Path(parentOutputDir);
78-
FileSystem fs = outputPath.getFileSystem(job);
79-
createDirectoryWithPermission(fs, outputPath, "777");
80-
81-
// store+job specific unique directory under parent directory: already derived in VPJ driver
82-
// and passed along with the format: {$storeName}-{$JOB_EXEC_ID}-{$randomUniqueString}
83-
// this job creates it and VPJ driver deletes it after consuming the data in this directory
84-
// in ValidateSchemaAndBuildDictMapperOutputReader. setting 700 permissions for pii.
40+
protected static void setValidateSchemaAndBuildDictionaryOutputDirPath(JobConf job) {
8541
String fullOutputDir = job.get(VALIDATE_SCHEMA_AND_BUILD_DICT_MAPPER_OUTPUT_DIRECTORY);
86-
outputPath = new Path(fullOutputDir);
87-
createDirectoryWithPermission(fs, outputPath, "700");
42+
Path outputPath = new Path(fullOutputDir);
8843

8944
LOGGER.info(
9045
"{} Output will be stored in path: {}",

0 commit comments

Comments
 (0)