Skip to content

Commit 25d182e

Browse files
authored
chore(ci): try to check missing files (#1027)
* chore(ci): try to check missing files Signed-off-by: Chojan Shang <[email protected]> * chore(ci): clean workflow Signed-off-by: Chojan Shang <[email protected]> * chore(scripts): minor update Signed-off-by: Chojan Shang <[email protected]> * chore(scripts): allow skipping some files and directories Signed-off-by: Chojan Shang <[email protected]> * chore(scripts): minor update Signed-off-by: Chojan Shang <[email protected]> --------- Signed-off-by: Chojan Shang <[email protected]>
1 parent 80c9f99 commit 25d182e

File tree

7 files changed

+172
-13
lines changed

7 files changed

+172
-13
lines changed

.github/workflows/docs-commit.translate.yaml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -38,8 +38,8 @@ jobs:
3838
with:
3939
apikey: ${{ secrets.OPENAI_API_KEY }}
4040
model: "deepseek-chat"
41-
inputFiles: "${{ steps.changed_files.outputs.files }}"
42-
outputFiles: "docs/cn/**/*.md"
43-
languages: "Simplified-Chinese"
44-
prompt: "You are a translation engine that has knowledge of databases and is familiar with SQL, HTML and Markdown syntax. \ Databend is a cloud-native data warehouse and an alternative to Snowflake. \ I am translating the Databend documentation for helping users.\ Translate the Markdown content I'll paste later into {targetLanguage}.\ You must strictly follow the rules below.\ - Never change the Markdown markup structure. Don't add or remove links. Do not change any URL.\ - Never change the contents of code blocks even if they appear to have a bug.\ - Always preserve the original line breaks. Do not add or remove blank lines.\ - Never touch the permalink such as `{/*examples*/}` at the end of each heading.\ - Never touch HTML-like tags such as `<Notes>`.\ - Correctly format the document for best rendering. \ - Please do not translate database or computing-specific terms"
41+
inputFiles: '${{ steps.changed_files.outputs.files }}'
42+
outputFiles: 'docs/cn/**/*.md'
43+
languages: 'Simplified-Chinese'
44+
prompt: "You are a translation engine that has knowledge of databases and is familiar with SQL, HTML, Markdown and JSON syntax. \ Databend is a cloud-native data warehouse and an alternative to Snowflake. \ I am translating the Databend documentation for helping users.\ Translate the Markdown or JSON content I'll paste later into {targetLanguage}.\ You must strictly follow the rules below.\ - Never change the Markdown markup structure. Don't add or remove links. Do not change any URL.\ - Never change the contents of code blocks even if they appear to have a bug.\ - Always preserve the original line breaks. Do not add or remove blank lines.\ - Never touch the permalink such as `{/*examples*/}` at the end of each heading.\ - Never touch HTML-like tags such as `<Notes>`.\ - Correctly format the document for best rendering. \ - Please do not translate database or computing-specific terms.\ -Keep the structure consistent with the source document and do not delete anything."
4545
basePath: "https://api.deepseek.com"
Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
name: GPT Translate Sync
2+
3+
on: workflow_dispatch
4+
5+
permissions:
6+
id-token: write
7+
pull-requests: write
8+
checks: write
9+
statuses: write
10+
contents: write
11+
12+
jobs:
13+
gpt_translate:
14+
runs-on: ubuntu-latest
15+
16+
steps:
17+
- name: Checkout
18+
uses: actions/checkout@v4
19+
with:
20+
fetch-depth: 0
21+
22+
- name: Delete unsync files
23+
id: delete_unsync
24+
run: |
25+
chmod +x ./scripts/delete_unsync.sh
26+
source ./scripts/delete_unsync.sh
27+
28+
- name: Commit changes
29+
uses: EndBug/add-and-commit@v9
30+
with:
31+
author_name: ${{ github.actor }}
32+
author_email: ${{ github.actor }}@users.noreply.github.com
33+
message: 'chore: delete unsync files'
34+
push: false
35+
36+
- name: Check missing files
37+
id: check_missing
38+
run: |
39+
chmod +x ./scripts/check_missing.sh
40+
source ./scripts/check_missing.sh
41+
echo "missing_md_files=$(echo $missing_md_files | tr '\n' ' ')" >> $GITHUB_OUTPUT
42+
echo "missing_json_files=$(echo $missing_json_files | tr '\n' ' ')" >> $GITHUB_OUTPUT
43+
44+
- name: Log Missing Files
45+
run: |
46+
echo "Markdown Files Missing: ${{ steps.check_missing.outputs.missing_md_files }}"
47+
echo "JSON Files Missing: ${{ steps.check_missing.outputs.missing_json_files }}"
48+
49+
- name: Run GPT Translate
50+
51+
with:
52+
apikey: ${{ secrets.OPENAI_API_KEY }}
53+
model: "deepseek-chat"
54+
inputFiles: "${{ steps.check_missing.outputs.missing_md_files }} ${{ steps.check_missing.outputs.missing_json_files }}"
55+
outputFiles: "docs/cn/**/*"
56+
languages: "Simplified-Chinese"
57+
prompt: "You are a translation engine that has knowledge of databases and is familiar with SQL, HTML, Markdown and JSON syntax. \ Databend is a cloud-native data warehouse and an alternative to Snowflake. \ I am translating the Databend documentation for helping users.\ Translate the Markdown or JSON content I'll paste later into {targetLanguage}.\ You must strictly follow the rules below.\ - Never change the Markdown markup structure. Don't add or remove links. Do not change any URL.\ - Never change the contents of code blocks even if they appear to have a bug.\ - Always preserve the original line breaks. Do not add or remove blank lines.\ - Never touch the permalink such as `{/*examples*/}` at the end of each heading.\ - Never touch HTML-like tags such as `<Notes>`.\ - Correctly format the document for best rendering. \ - Please do not translate database or computing-specific terms.\ -Keep the structure consistent with the source document and do not delete anything."
58+
basePath: "https://api.deepseek.com"

.github/workflows/docs.translate.dir.yaml

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -35,19 +35,24 @@ jobs:
3535
git add .
3636
3737
- name: Get all files in specified directory
38-
id: all_input_files
38+
id: check_missing
3939
run: |
4040
TRANSLATION_DIR="${{ github.event.inputs.path }}"
41-
echo "files=$(find $TRANSLATION_DIR -type f \( -name '*.md' \) | sed -e 's/^/.\//' | tr '\n' ' ')" >> $GITHUB_OUTPUT
41+
echo "missing_md_files=$(find $TRANSLATION_DIR -type f \( -name '*.md' \) | sed -e 's/^/.\//' | tr '\n' ' ')" >> $GITHUB_OUTPUT
42+
echo "missing_json_files=$(find $TRANSLATION_DIR -type f \( -name '*.json' \) | sed -e 's/^/.\//' | tr '\n' ' ')" >> $GITHUB_OUTPUT
43+
44+
- name: Log Missing Files
45+
run: |
46+
echo "Markdown Files Missing: ${{ steps.check_missing.outputs.missing_md_files }}"
47+
echo "JSON Files Missing: ${{ steps.check_missing.outputs.missing_json_files }}"
4248
4349
- name: Run GPT Translate
4450
4551
with:
4652
apikey: ${{ secrets.OPENAI_API_KEY }}
4753
model: "deepseek-chat"
48-
inputFiles: '${{ steps.all_input_files.outputs.files }}'
49-
outputFiles: 'docs/cn/**/*.md'
50-
languages: 'Simplified-Chinese'
51-
prompt: "You are a translation engine that has knowledge of databases and is familiar with SQL, HTML and Markdown syntax. \ Databend is a cloud-native data warehouse and an alternative to Snowflake. \ I am translating the Databend documentation for helping users.\ Translate the Markdown content I'll paste later into {targetLanguage}.\ You must strictly follow the rules below.\ - Never change the Markdown markup structure. Don't add or remove links. Do not change any URL.\ - Never change the contents of code blocks even if they appear to have a bug.\ - Always preserve the original line breaks. Do not add or remove blank lines.\ - Never touch the permalink such as `{/*examples*/}` at the end of each heading.\ - Never touch HTML-like tags such as `<Notes>`.\ - Correctly format the document for best rendering. \ - Please do not translate database or computing-specific terms.\ -Keep the structure consistent with the source document and do not delete anything."
54+
inputFiles: "${{ steps.check_missing.outputs.missing_md_files }} ${{ steps.check_missing.outputs.missing_json_files }}"
55+
outputFiles: "docs/cn/**/*"
56+
languages: "Simplified-Chinese"
57+
prompt: "You are a translation engine that has knowledge of databases and is familiar with SQL, HTML, Markdown and JSON syntax. \ Databend is a cloud-native data warehouse and an alternative to Snowflake. \ I am translating the Databend documentation for helping users.\ Translate the Markdown or JSON content I'll paste later into {targetLanguage}.\ You must strictly follow the rules below.\ - Never change the Markdown markup structure. Don't add or remove links. Do not change any URL.\ - Never change the contents of code blocks even if they appear to have a bug.\ - Always preserve the original line breaks. Do not add or remove blank lines.\ - Never touch the permalink such as `{/*examples*/}` at the end of each heading.\ - Never touch HTML-like tags such as `<Notes>`.\ - Correctly format the document for best rendering. \ - Please do not translate database or computing-specific terms.\ -Keep the structure consistent with the source document and do not delete anything."
5258
basePath: "https://api.deepseek.com"
53-
token: ${{ secrets.GITHUB_TOKEN }}

.github/workflows/docs.translate.yaml

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,5 +18,6 @@ jobs:
1818
with:
1919
apikey: ${{ secrets.OPENAI_API_KEY }}
2020
model: "deepseek-chat"
21-
prompt: "You are a translation engine that has knowledge of databases and is familiar with SQL, HTML and Markdown syntax. \ Databend is a cloud-native data warehouse and an alternative to Snowflake. \ I am translating the Databend documentation for helping users.\ Translate the Markdown content I'll paste later into {targetLanguage}.\ You must strictly follow the rules below.\ - Never change the Markdown markup structure. Don't add or remove links. Do not change any URL.\ - Never change the contents of code blocks even if they appear to have a bug.\ - Always preserve the original line breaks. Do not add or remove blank lines.\ - Never touch the permalink such as `{/*examples*/}` at the end of each heading.\ - Never touch HTML-like tags such as `<Notes>`.\ - Correctly format the document for best rendering. \ - Please do not translate database or computing-specific terms"
22-
basePath: "https://api.deepseek.com"
21+
prompt: "You are a translation engine that has knowledge of databases and is familiar with SQL, HTML, Markdown and JSON syntax. \ Databend is a cloud-native data warehouse and an alternative to Snowflake. \ I am translating the Databend documentation for helping users.\ Translate the Markdown or JSON content I'll paste later into {targetLanguage}.\ You must strictly follow the rules below.\ - Never change the Markdown markup structure. Don't add or remove links. Do not change any URL.\ - Never change the contents of code blocks even if they appear to have a bug.\ - Always preserve the original line breaks. Do not add or remove blank lines.\ - Never touch the permalink such as `{/*examples*/}` at the end of each heading.\ - Never touch HTML-like tags such as `<Notes>`.\ - Correctly format the document for best rendering. \ - Please do not translate database or computing-specific terms.\ -Keep the structure consistent with the source document and do not delete anything."
22+
basePath: "https://api.deepseek.com"
23+

scripts/check_missing.sh

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
#!/bin/bash
#
# Reports files that exist under docs/en but have no counterpart at the same
# relative path under docs/cn.
#
# The results are left in two newline-separated shell variables so that a
# caller which sources this script can read them afterwards:
#   missing_md_files    "./docs/en/..." paths of *.md files missing in docs/cn
#   missing_json_files  "./docs/en/..." paths of *.json files missing in docs/cn
# Both lists are also printed to stdout.

# Define English directory and Chinese directory.
en_dir="docs/en"
cn_dir="docs/cn"

#######################################
# Scans en_dir for Markdown/JSON files that are absent from cn_dir.
# Globals:   en_dir, cn_dir (read)
#            missing_md_files, missing_json_files (reset, then written)
# Arguments: none
# Outputs:   nothing on stdout
# Returns:   0
#######################################
collect_missing_translations() {
  local en_file en_relative_path
  local newline=$'\n'

  missing_md_files=""
  missing_json_files=""

  # The loop is fed through process substitution instead of a pipe, so it runs
  # in the current shell. Results can therefore be accumulated in variables
  # directly, and no temporary files have to be created or cleaned up.
  while IFS= read -r -d '' en_file; do
    # Path of en_file relative to en_dir. The prefix is quoted so that it is
    # stripped literally rather than interpreted as a glob pattern.
    en_relative_path="${en_file#"$en_dir"/}"

    # Nothing to report when the translated counterpart already exists.
    if [[ -e "$cn_dir/$en_relative_path" ]]; then
      continue
    fi

    case "$en_relative_path" in
      *.md)
        missing_md_files+="${missing_md_files:+$newline}./$en_dir/$en_relative_path"
        ;;
      *.json)
        missing_json_files+="${missing_json_files:+$newline}./$en_dir/$en_relative_path"
        ;;
    esac
  done < <(find "$en_dir" -type f \( -name "*.md" -o -name "*.json" \) -print0)

  return 0
}

collect_missing_translations

# Output results.
echo "Missing Markdown files:"
echo "$missing_md_files"
echo "Missing JSON files:"
echo "$missing_json_files"
43+

scripts/delete_unsync.sh

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
#!/bin/bash

# Roots of the Chinese and English documentation trees.
cn_dir='docs/cn'
en_dir='docs/en'

# Load the exclusion list, one entry per line, into the `exclusions` array.
# The file name is relative, so it is resolved against the current working
# directory of whoever runs or sources this script.
readarray -t exclusions < sync_exclusions.txt
9+
10+
#######################################
# Check if the path is excluded.
# A path is excluded when it equals an entry of the exclusion list or lies
# somewhere below an entry (i.e. the entry is one of its parent directories).
# Globals:   exclusions (read) - array of excluded paths
# Arguments: $1 - path to test
# Returns:   0 if the path is excluded, 1 otherwise
#######################################
is_excluded() {
  local path="$1"
  local exclude

  for exclude in "${exclusions[@]}"; do
    # Tolerate entries written with a trailing slash ("some/dir/").
    exclude="${exclude%/}"

    # Ignore blank entries (e.g. an empty line in the exclusion file); an
    # empty prefix would otherwise match every path and exclude everything.
    if [[ -z "${exclude//[[:space:]]/}" ]]; then
      continue
    fi

    # Match the full path or a sub-path. The "/" boundary keeps a sibling that
    # merely shares the prefix (entry "a/b" vs. path "a/b-other") from matching.
    if [[ "$path" == "$exclude" || "$path" == "$exclude"/* ]]; then
      return 0 # path is excluded
    fi
  done

  return 1 # path is not excluded
}
21+
22+
# Delete the given directory, but only when it is an existing, empty,
# non-excluded directory other than cn_dir itself.
# Arguments: $1 - directory to consider
# Outputs:   "Removed empty directory: <dir>" after a removal
remove_empty_dir() {
  local candidate="$1"

  # Never remove the cn_dir root itself.
  [[ "$candidate" == "$cn_dir" ]] && return 0
  # Skip anything that is not an existing directory.
  [ -d "$candidate" ] || return 0
  # Skip directories that still have entries (hidden ones included).
  [ -n "$(ls -A "$candidate")" ] && return 0
  # Skip directories covered by the exclusion list.
  is_excluded "$candidate" && return 0

  rmdir "$candidate"
  echo "Removed empty directory: $candidate"
}
31+
32+
#######################################
# Search for files that exist in the cn directory but do not exist in the en
# directory, and delete them unless they are on the exclusion list.
# Globals:   cn_dir, en_dir (read)
# Outputs:   one "Removed file: <path>" line per deleted file
#######################################
delete_orphaned_translations() {
  local cn_file cn_relative_path en_file

  while IFS= read -r -d '' cn_file; do
    # Path of cn_file relative to cn_dir. The prefix is quoted so that it is
    # stripped literally rather than interpreted as a glob pattern.
    cn_relative_path="${cn_file#"$cn_dir"/}"
    # Corresponding path in the en tree.
    en_file="$en_dir/$cn_relative_path"

    # Delete only when the en file is gone and cn_file is not excluded.
    if [[ ! -e "$en_file" ]] && ! is_excluded "$cn_file"; then
      # find yields regular files only, so no recursive removal is needed;
      # "--" protects against paths that start with a dash.
      rm -f -- "$cn_file"
      echo "Removed file: $cn_file"
    fi
  done < <(find "$cn_dir" -type f \( -name "*.md" -o -name "*.json" \) -print0)
}

#######################################
# Delete empty directories below cn_dir, including parents that only become
# empty once their (empty) children have been removed.
# Globals:   cn_dir (read)
# Outputs:   whatever remove_empty_dir prints for each removal
#######################################
prune_empty_directories() {
  local dir
  local -a dirs=()

  # Take a snapshot of all directories first so nothing is deleted while find
  # is still walking the tree. -depth lists a directory's contents before the
  # directory itself, so children are handled before their parents and
  # remove_empty_dir sees a parent only after its children are gone.
  while IFS= read -r -d '' dir; do
    dirs+=("$dir")
  done < <(find "$cn_dir" -depth -type d -print0)

  for dir in "${dirs[@]}"; do
    remove_empty_dir "$dir"
  done
}

delete_orphaned_translations
prune_empty_directories

sync_exclusions.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
docs/cn/tutorials/01-taobao.md
2+
docs/cn/guides/20-cloud
3+
docs/cn/guides/00-overview/00-editions/02-dc

0 commit comments

Comments
 (0)