
Commit c1483bc

added (#1838)
1 parent 60e5438 commit c1483bc

File tree

9 files changed: +130 −19 lines changed


docs/en/guides/40-load-data/03-load-semistructured/00-load-parquet.md

Lines changed: 1 addition & 1 deletion
@@ -1,5 +1,5 @@
 ---
-title: Loading Parquet File into Databend
+title: Loading Parquet into Databend
 sidebar_label: Parquet
 ---

docs/en/guides/40-load-data/03-load-semistructured/01-load-csv.md

Lines changed: 1 addition & 1 deletion
@@ -1,5 +1,5 @@
 ---
-title: Loading CSV File into Databend
+title: Loading CSV into Databend
 sidebar_label: CSV
 ---

docs/en/guides/40-load-data/03-load-semistructured/02-load-tsv.md

Lines changed: 1 addition & 1 deletion
@@ -1,5 +1,5 @@
 ---
-title: Loading TSV File into Databend
+title: Loading TSV into Databend
 sidebar_label: TSV
 ---

docs/en/guides/40-load-data/03-load-semistructured/03-load-ndjson.md

Lines changed: 1 addition & 1 deletion
@@ -1,5 +1,5 @@
 ---
-title: Loading NDJSON File into Databend
+title: Loading NDJSON into Databend
 sidebar_label: NDJSON
 ---

docs/en/guides/40-load-data/03-load-semistructured/04-load-orc.md

Lines changed: 1 addition & 1 deletion
@@ -1,5 +1,5 @@
 ---
-title: Loading ORC File into Databend
+title: Loading ORC into Databend
 sidebar_label: ORC
 ---

Lines changed: 109 additions & 0 deletions
@@ -0,0 +1,109 @@
---
title: Loading Avro into Databend
sidebar_label: Avro
---

## What is Avro?

[Apache Avro™](https://avro.apache.org/) is the leading serialization format for record data and the first choice for streaming data pipelines.
## Loading Avro Files

The common syntax for loading Avro files is as follows:

```sql
COPY INTO [<database>.]<table_name>
FROM { internalStage | externalStage | externalLocation }
[ PATTERN = '<regex_pattern>' ]
FILE_FORMAT = (TYPE = AVRO)
```

More details about the syntax can be found in [COPY INTO table](/sql/sql-commands/dml/dml-copy-into-table).
## Tutorial: Loading Avro Data into Databend from Remote HTTP URL

In this tutorial, you will create a table in Databend using an Avro schema and load Avro data directly from a GitHub-hosted `.avro` file via HTTPS.

### Step 1: Review the Avro Schema

Before creating a table in Databend, let's take a quick look at the Avro schema we're working with: [userdata.avsc](https://github.com/Teradata/kylo/blob/master/samples/sample-data/avro/userdata.avsc). This schema defines a record named `User` with 13 fields, mostly of type `string`, along with `int` and `float`.
```json
{
  "type": "record",
  "name": "User",
  "fields": [
    {"name": "registration_dttm", "type": "string"},
    {"name": "id", "type": "int"},
    {"name": "first_name", "type": "string"},
    {"name": "last_name", "type": "string"},
    {"name": "email", "type": "string"},
    {"name": "gender", "type": "string"},
    {"name": "ip_address", "type": "string"},
    {"name": "cc", "type": "string"},
    {"name": "country", "type": "string"},
    {"name": "birthdate", "type": "string"},
    {"name": "salary", "type": "float"},
    {"name": "title", "type": "string"},
    {"name": "comments", "type": "string"}
  ]
}
```
### Step 2: Create a Table in Databend

Create a table that matches the structure defined in the schema:

```sql
CREATE TABLE userdata (
    registration_dttm STRING,
    id INT,
    first_name STRING,
    last_name STRING,
    email STRING,
    gender STRING,
    ip_address STRING,
    cc VARIANT,
    country STRING,
    birthdate STRING,
    salary FLOAT,
    title STRING,
    comments STRING
);
```
### Step 3: Load Data from a Remote HTTPS URL

```sql
COPY INTO userdata
FROM 'https://raw.githubusercontent.com/Teradata/kylo/master/samples/sample-data/avro/userdata1.avro'
FILE_FORMAT = (type = avro);
```
```sql
┌──────────────────────────────────────────────────────────────┬─────────────┬─────────────┬─────────────┬──────────────────┐
│                             File                             │ Rows_loaded │ Errors_seen │ First_error │ First_error_line │
├──────────────────────────────────────────────────────────────┼─────────────┼─────────────┼─────────────┼──────────────────┤
│ Teradata/kylo/master/samples/sample-data/avro/userdata1.avro │        1000 │           0 │ NULL        │             NULL │
└──────────────────────────────────────────────────────────────┴─────────────┴─────────────┴─────────────┴──────────────────┘
```
### Step 4: Query the Data

You can now explore the data you just imported:

```sql
SELECT id, first_name, email, salary FROM userdata LIMIT 5;
```
```sql
┌─────┬────────────┬──────────────────────────┬───────────┐
│ id  │ first_name │ email                    │ salary    │
├─────┼────────────┼──────────────────────────┼───────────┤
│   1 │ Amanda     │ ajordan0@com.com         │  49756.53 │
│   2 │ Albert     │ afreeman1@is.gd          │ 150280.17 │
│   3 │ Evelyn     │ emorgan2@altervista.org  │ 144972.52 │
│   4 │ Denise     │ driley3@gmpg.org         │  90263.05 │
│   5 │ Carlos     │ cburns4@miitbeian.gov.cn │      NULL │
└─────┴────────────┴──────────────────────────┴───────────┘
```
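As a quick sanity check after loading, the row count should match the `Rows_loaded` value reported by `COPY INTO` (a sketch; `userdata` is the table created in Step 2 above):

```sql
-- Should return 1000, matching Rows_loaded from the COPY INTO result
SELECT COUNT(*) FROM userdata;
```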

docs/en/guides/40-load-data/03-load-semistructured/index.md

Lines changed: 0 additions & 10 deletions
@@ -17,14 +17,4 @@ Copy from semi-structured data format is the most common way to load data into D
 
 Databend supports several semi-structured data formats loaded using the `COPY INTO` command:
 
-- **Parquet**: A columnar storage format, ideal for optimizing data storage and retrieval. It is best suited for complex data structures and offers efficient data compression and encoding schemes.
-
-- **CSV (Comma-Separated Values)**: A simple format that is widely used for data exchange. CSV files are easy to read and write but might not be ideal for complex hierarchical data structures.
-
-- **TSV (Tab-Separated Values)**: Similar to CSV, but uses tabs as delimiters. It's often used for data with simple structures that require a delimiter other than a comma.
-
-- **NDJSON (Newline Delimited JSON)**: This format represents JSON data with each JSON object separated by a newline. It is particularly useful for streaming large datasets and handling data that changes frequently. NDJSON facilitates the processing of large volumes of data by breaking it down into manageable, line-delimited chunks.
-
-
-For detailed instructions on how to load semi-structured data, check out the following topics:
 <IndexOverviewList />

docs/en/sql-reference/00-sql-reference/50-file-format-options.md

Lines changed: 14 additions & 2 deletions
@@ -13,13 +13,13 @@ To specify a file format in a statement, use the following syntax:
 
 ```sql
 -- Specify a standard file format
-... FILE_FORMAT = ( TYPE = { CSV | TSV | NDJSON | PARQUET | ORC } [ formatTypeOptions ] )
+... FILE_FORMAT = ( TYPE = { CSV | TSV | NDJSON | PARQUET | ORC | AVRO } [ formatTypeOptions ] )
 
 -- Specify a custom file format
 ... FILE_FORMAT = ( FORMAT_NAME = '<your-custom-format>' )
 ```
 
-- Databend currently supports ORC as a source ONLY. Unloading data into an ORC file is not supported yet.
+- Databend currently supports ORC and AVRO as a source ONLY. Unloading data into an ORC or AVRO file is not supported yet.
 - If you don't specify the FILE_FORMAT when performing a COPY INTO or SELECT operation from a stage, Databend will use the file format that you initially defined for the stage when you created it. In cases where you didn't explicitly specify a file format during the stage creation, Databend defaults to using the PARQUET format. If you specify a different FILE_FORMAT from the one you defined when creating the stage, Databend will prioritize the FILE_FORMAT specified during the operation.
 - For managing custom file formats in Databend, see [File Format](../10-sql-commands/00-ddl/13-file-format/index.md).
 
@@ -251,3 +251,15 @@ Determines the behavior when encountering missing fields during data loading. Refer to the options in the table below for possible configurations.
 |------------------|-----------------------------------------------------------------------------------------------|
 | `ERROR` (Default)| Generates an error if a missing field is encountered. |
 | `FIELD_DEFAULT` | Uses the default value of the field for missing fields. |
+
+## AVRO Options
+
+### MISSING_FIELD_AS (Load Only)
+
+Determines the behavior when encountering missing fields during data loading. Refer to the options in the table below for possible configurations.
+
+| Available Values | Description |
+|------------------|-------------|
+| `ERROR` (Default)| Generates an error if a missing field is encountered. |
+| `FIELD_DEFAULT` | Uses the default value of the field for missing fields. |
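To illustrate how this option might be used in practice (a sketch; `my_table` and `@my_stage` are hypothetical names, not part of the change above):

```sql
-- Hypothetical example: fields absent from the Avro records are filled
-- with the corresponding column defaults instead of raising an error
COPY INTO my_table
FROM @my_stage
FILE_FORMAT = (TYPE = AVRO, MISSING_FIELD_AS = FIELD_DEFAULT);
```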

docs/en/sql-reference/10-sql-commands/10-dml/dml-copy-into-table.md

Lines changed: 2 additions & 2 deletions
@@ -5,7 +5,7 @@ sidebar_label: "COPY INTO <table>"
 
 import FunctionDescription from '@site/src/components/FunctionDescription';
 
-<FunctionDescription description="Introduced or updated: v1.2.666"/>
+<FunctionDescription description="Introduced or updated: v1.2.704"/>
 
 COPY INTO allows you to load data from files located in one of the following locations:
 
@@ -27,7 +27,7 @@ COPY INTO [<database_name>.]<table_name>
 [ PATTERN = '<regex_pattern>' ]
 [ FILE_FORMAT = (
   FORMAT_NAME = '<your-custom-format>'
-  | TYPE = { CSV | TSV | NDJSON | PARQUET | ORC } [ formatTypeOptions ]
+  | TYPE = { CSV | TSV | NDJSON | PARQUET | ORC | AVRO } [ formatTypeOptions ]
 ) ]
 [ copyOptions ]
 ```
