Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add json pre-validation before parsing manifest file to proto in TextImport #1532

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
6 changes: 6 additions & 0 deletions v1/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@
<cassandra.driver.version>3.6.0</cassandra.driver.version>
<kms.version>1.40.0</kms.version>
<proto-kms.version>0.87.0</proto-kms.version>
<json.version>20240303</json.version>
<jacoco.version>0.8.8</jacoco.version>
<excluded.spanner.tests>com.google.cloud.teleport.spanner.IntegrationTest</excluded.spanner.tests>
</properties>
Expand Down Expand Up @@ -542,6 +543,11 @@
<groupId>com.google.cloud</groupId>
<artifactId>google-cloud-secretmanager</artifactId>
</dependency>
<dependency>
<groupId>org.json</groupId>
<artifactId>json</artifactId>
<version>${json.version}</version>
</dependency>
</dependencies>

<build>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,10 @@
import org.apache.beam.sdk.values.TupleTag;
import org.apache.beam.sdk.values.TupleTagList;
import org.apache.beam.sdk.values.TypeDescriptor;
import org.apache.commons.io.input.BOMInputStream;
import org.joda.time.Duration;
import org.json.JSONObject;
import org.json.JSONTokener;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

Expand Down Expand Up @@ -374,8 +377,9 @@ public void processElement(ProcessContext c) {
private static ImportManifest readManifest(ResourceId fileResource) {
ImportManifest.Builder result = ImportManifest.newBuilder();
try (InputStream stream = Channels.newInputStream(FileSystems.open(fileResource))) {
Reader reader = new InputStreamReader(stream, StandardCharsets.UTF_8);
JsonFormat.parser().merge(reader, result);
Reader reader = new InputStreamReader(new BOMInputStream(stream), StandardCharsets.UTF_8);
JSONObject json = new JSONObject(new JSONTokener(reader));
JsonFormat.parser().merge(json.toString(), result);
} catch (IOException e) {
throw new RuntimeException(
"Failed to read manifest. Make sure it is ASCII or UTF-8 encoded and contains a"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,53 @@ public void readImportManifest() throws Exception {
pipeline.run();
}

@Test
public void readImportManifestWithExtraCommas() throws Exception {
Path f11 = Files.createTempFile("table1-file", "1");
Path f12 = Files.createTempFile("table1-file", "2");
Path f13 = Files.createTempFile("table1-file", "3");
Path f21 = Files.createTempFile("table2-file", "1");
Path f22 = Files.createTempFile("table2-file", "2");
String tempDir = f11.getParent().toString();

Path manifestFile = Files.createTempFile("import-manifest", ".json");
Charset charset = Charset.forName("UTF-8");
try (BufferedWriter writer = Files.newBufferedWriter(manifestFile, charset)) {
String jsonString =
String.format(
"{\"tables\": ["
+ "{\"table_name\": \"table1\","
+ "\"file_patterns\":[\"%s\",\"%s\"],},"
+ "{\"table_name\": \"table2\","
+ "\"file_patterns\":[\"%s\",]}"
+ "]}",
f11.toString(), f12.toString(), f21.toString());
writer.write(jsonString, 0, jsonString.length());
} catch (IOException e) {
e.printStackTrace();
}

ValueProvider<String> importManifest =
ValueProvider.StaticValueProvider.of(manifestFile.toString());
PCollectionView<Ddl> ddlView =
pipeline.apply("ddl", Create.of(getTestDdl())).apply(View.asSingleton());

PCollection<KV<String, String>> tableAndFiles =
pipeline
.apply("Read manifest file", new ReadImportManifest(importManifest))
.apply("Resolve data files", new ResolveDataFiles(importManifest, ddlView));

// Validates that only the file patterns specified in manifest will be returned.
// E.g., f13 and f22 are not in the tableAndFiles result.
PAssert.that(tableAndFiles)
.containsInAnyOrder(
KV.of("table1", f11.toString()),
KV.of("table1", f12.toString()),
KV.of("table2", f21.toString()));

pipeline.run();
}

@Test
public void readImportManifestUtfWithBOM() throws Exception {
Path f11 = Files.createTempFile("table1-file", "1");
Expand Down