Skip to content

Commit 9fdedc4

Browse files
authored
[Improve][Connector][Hive] skip temporary hidden directories (#8402)
1 parent b6da060 commit 9fdedc4

File tree

2 files changed

+138
-1
lines changed

2 files changed

+138
-1
lines changed

seatunnel-connectors-v2/connector-file/connector-file-base/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/source/reader/AbstractReadStrategy.java

+4-1
Original file line number | Diff line number | Diff line change
@@ -110,7 +110,10 @@ public List<String> getFileNamesByPath(String path) throws IOException {
110110
FileStatus[] stats = hadoopFileSystemProxy.listStatus(path);
111111
for (FileStatus fileStatus : stats) {
112112
if (fileStatus.isDirectory()) {
113-
fileNames.addAll(getFileNamesByPath(fileStatus.getPath().toString()));
113+
// skip hidden tmp directory, such as .hive-staging_hive
114+
if (!fileStatus.getPath().getName().startsWith(".")) {
115+
fileNames.addAll(getFileNamesByPath(fileStatus.getPath().toString()));
116+
}
114117
continue;
115118
}
116119
if (fileStatus.isFile() && filterFileByPattern(fileStatus) && fileStatus.getLen() > 0) {
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,134 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
18+
package org.apache.seatunnel.connectors.seatunnel.file.source.reader;
19+
20+
import org.apache.seatunnel.connectors.seatunnel.file.writer.ParquetReadStrategyTest;
21+
22+
import org.apache.avro.Schema;
23+
import org.apache.avro.generic.GenericArray;
24+
import org.apache.avro.generic.GenericData;
25+
import org.apache.avro.generic.GenericRecord;
26+
import org.apache.avro.util.Utf8;
27+
import org.apache.hadoop.conf.Configuration;
28+
import org.apache.hadoop.fs.Path;
29+
import org.apache.parquet.avro.AvroParquetWriter;
30+
import org.apache.parquet.hadoop.ParquetWriter;
31+
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
32+
33+
import org.junit.jupiter.api.Assertions;
34+
import org.junit.jupiter.api.Test;
35+
import org.junit.jupiter.api.condition.DisabledOnOs;
36+
import org.junit.jupiter.api.condition.OS;
37+
38+
import java.io.File;
39+
import java.io.IOException;
40+
import java.util.List;
41+
42+
import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.FS_DEFAULT_NAME_DEFAULT;
43+
44+
public class AbstractReadStrategyTest {
45+
46+
@DisabledOnOs(OS.WINDOWS)
47+
@Test
48+
public void testReadDirectorySkipHiddenDirectories() throws Exception {
49+
AutoGenerateParquetData.generateTestData();
50+
try (ParquetReadStrategy parquetReadStrategy = new ParquetReadStrategy(); ) {
51+
ParquetReadStrategyTest.LocalConf localConf =
52+
new ParquetReadStrategyTest.LocalConf(FS_DEFAULT_NAME_DEFAULT);
53+
parquetReadStrategy.init(localConf);
54+
List<String> list =
55+
parquetReadStrategy.getFileNamesByPath(AutoGenerateParquetData.DATA_FILE_PATH);
56+
Assertions.assertEquals(1, list.size());
57+
Assertions.assertTrue(
58+
list.get(0).endsWith(AutoGenerateParquetData.DATA_FILE_PATH_KEEP));
59+
} finally {
60+
AutoGenerateParquetData.deleteFile(AutoGenerateParquetData.DATA_FILE_PATH);
61+
}
62+
}
63+
64+
public static class AutoGenerateParquetData {
65+
66+
public static final String DATA_FILE_PATH = "/tmp/tmp_1";
67+
public static final String DATA_FILE_PATH_KEEP = "/tmp/tmp_1/dt=20241230/00000";
68+
public static final String DATA_FILE_PATH_IGNORE = "/tmp/tmp_1/.hive-stage/00000";
69+
70+
public static void generateTestData() throws IOException {
71+
deleteFile(DATA_FILE_PATH);
72+
createFile(DATA_FILE_PATH_KEEP);
73+
createFile(DATA_FILE_PATH_IGNORE);
74+
}
75+
76+
public static void write(String filePath) throws IOException {
77+
String schemaString =
78+
"{\"type\":\"record\",\"name\":\"User\",\"fields\":[{\"name\":\"id\",\"type\":{\"type\": \"array\", \"items\": {\"type\": \"array\", \"items\": \"bytes\"}}},{\"name\":\"id2\",\"type\":{\"type\": \"array\", \"items\": {\"type\": \"array\", \"items\": \"bytes\"}}},{\"name\":\"long\",\"type\":\"long\"}]}";
79+
Schema schema = new Schema.Parser().parse(schemaString);
80+
81+
Configuration conf = new Configuration();
82+
83+
Path file = new Path(filePath);
84+
85+
ParquetWriter<GenericRecord> writer =
86+
AvroParquetWriter.<GenericRecord>builder(file)
87+
.withSchema(schema)
88+
.withConf(conf)
89+
.withCompressionCodec(CompressionCodecName.SNAPPY)
90+
.build();
91+
92+
GenericRecord record1 = new GenericData.Record(schema);
93+
GenericArray<GenericData.Array<Utf8>> id =
94+
new GenericData.Array<>(2, schema.getField("id").schema());
95+
id.add(new GenericData.Array<>(2, schema.getField("id").schema().getElementType()));
96+
id.add(new GenericData.Array<>(2, schema.getField("id").schema().getElementType()));
97+
record1.put("id", id);
98+
record1.put("id2", id);
99+
record1.put("long", Long.MAX_VALUE);
100+
writer.write(record1);
101+
writer.close();
102+
}
103+
104+
public static void createFile(String dir) throws IOException {
105+
File f2 = new File(dir);
106+
if (!f2.exists()) {
107+
if (!f2.getParentFile().exists()) {
108+
boolean b = f2.getParentFile().mkdirs();
109+
Assertions.assertTrue(b);
110+
}
111+
write(f2.getPath());
112+
}
113+
}
114+
115+
public static void deleteFile(String file) {
116+
File parquetFile = new File(file);
117+
if (parquetFile.exists()) {
118+
if (parquetFile.isDirectory()) {
119+
File[] l = parquetFile.listFiles();
120+
if (l != null) {
121+
for (File s : l) {
122+
deleteFile(s.getPath());
123+
}
124+
}
125+
boolean b = parquetFile.delete();
126+
Assertions.assertTrue(b);
127+
} else {
128+
boolean b = parquetFile.delete();
129+
Assertions.assertTrue(b);
130+
}
131+
}
132+
}
133+
}
134+
}

0 commit comments

Comments
 (0)