forked from apache/doris-website
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcheck_move_global.py
More file actions
166 lines (140 loc) · 6.74 KB
/
check_move_global.py
File metadata and controls
166 lines (140 loc) · 6.74 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
# Detect global dead links
#
# Core logic:
# Traverse all documents, match the links in the documents, and determine whether it is a dead link by the link address;
# if it is a dead link, try to fix it. If the fix fails, it will print: ❌ xxxx/xxxx.md: Could not fix broken link ${target_link};
# if the fix is successful, it will print: 🛠️ xxxx/xxxx.md: Fixed broken link ${dead_link} -> ${link}
#
# Repair the logic of broken links:
# Starting from the directory of the document that contains the broken link, walk up the directory tree level by level and look for a document whose name matches the document name in the broken link; if none is found, search all documentation directories.
# If a match is found, its location is taken to be the correct target of the broken link.
# This covers the case where the linked document was moved to another directory. If the document was deleted, the repair fails.
#
# Links that are absolute paths or that carry a URL scheme (e.g. http/https) are not checked.
import argparse
import subprocess
import re
import os
import sys
from typing import AnyStr, List
from urllib.parse import urlparse
# (from_path, to_path) pairs for files renamed by the commit; assigned by extract_file_changes().
move_pairs = []
# Paths of files deleted by the commit; assigned by extract_file_changes().
deletes = []
# Set to True by process_md_file() when a link issue is reported or rewritten;
# the script exits with status 1 when it is True.
change_detected = False
# Top-level directories that are walked for .md/.mdx files.
search_dirs = ["docs", "i18n", "versioned_docs", "community"]
def is_same_file(path1, path2):
    """Tell whether two path strings are equal after ``os.path.normpath``.

    The comparison is textual only; the filesystem is not consulted.
    """
    normalized_first, normalized_second = (
        os.path.normpath(candidate) for candidate in (path1, path2)
    )
    return normalized_first == normalized_second
def remove_suffix(text: str, suffix: str):
    """Return ``text`` with a trailing ``suffix`` removed.

    ``text`` is returned unchanged when it does not end with ``suffix``.
    An empty ``suffix`` is also a no-op: every string "ends with" the empty
    string, and slicing with ``-0`` would otherwise discard the whole text.
    """
    if suffix and text.endswith(suffix):
        return text[: -len(suffix)]
    return text
def find_nearest_file(file_base, start_dir):
    """
    Locate a document named ``file_base`` with a ``.md`` or ``.mdx`` extension.

    The search first looks in ``start_dir`` and then in each parent directory
    (at most 10 levels). If nothing is found it walks every directory listed in
    the module-level ``search_dirs``.

    :param file_base: document name without extension
    :param start_dir: directory in which the upward search begins
    :return: path of the first match, or ``None`` when there is no match or
        ``file_base`` is empty
    """
    # An empty name would only ever match a file literally called ".md".
    if not file_base:
        return None

    extensions = (".md", ".mdx")

    cur_dir = start_dir
    # Search up to 10 levels upwards to avoid getting stuck
    for _ in range(10):
        for ext in extensions:
            candidate = os.path.join(cur_dir, file_base + ext)
            if os.path.exists(candidate):
                return candidate
        parent = os.path.dirname(cur_dir)
        if parent == cur_dir:
            # Reached the top of the path; there is nothing further up.
            break
        cur_dir = parent

    # Global search
    wanted_names = {file_base + ext for ext in extensions}
    for base_dir in search_dirs:
        for root, dirs, files in os.walk(base_dir):
            # Sort in place so os.walk descends in a stable order: the chosen
            # match must not depend on the filesystem's listing order.
            dirs.sort()
            for file in sorted(files):
                if file in wanted_names:
                    return os.path.join(root, file)
    return None
def _split_link_target(link):
    """Split ``link`` into its path part and any trailing ``#fragment``/``?query`` text."""
    for index, char in enumerate(link):
        if char in "#?":
            return link[:index], link[index:]
    return link, ""


def process_md_file(file_path):
    """
    Check the relative Markdown links in ``file_path`` and rewrite the repairable ones.

    For each ``[text](target)`` link that has no URL scheme and is not an
    absolute path:

    * if the target was renamed by the commit (``move_pairs``), the link is
      pointed at the new path;
    * if the target was deleted by the commit (``deletes``), a warning is printed;
    * if the target does not exist on disk, ``find_nearest_file`` is used to
      look for a document with the same name and the link is rewritten, or a
      "could not fix" message is printed. Files whose path contains
      ``version-1.2`` or ``version-2.0`` are exempt from this step.

    A trailing ``#fragment`` or ``?query`` is ignored when resolving the target
    and kept when the link is rewritten. Links that consist only of a fragment
    or query (in-page anchors) are skipped.

    Sets the module-level ``change_detected`` flag whenever something is
    reported or rewritten. The file is written back only if its content changed.

    :param file_path: path of the .md/.mdx file to process
    """
    global change_detected
    link_pattern = re.compile(r"\[.*?\]\((.*?)\)")
    with open(file_path, "r", encoding="utf-8") as f:
        content = f.read()

    doc_dir = os.path.dirname(file_path)
    legacy_version = "version-1.2" in file_path or "version-2.0" in file_path
    markdown_exts = [".md", ".mdx", ""]
    new_content = content

    for link in link_pattern.findall(content):
        if urlparse(link).scheme or os.path.isabs(link):
            continue

        target, suffix = _split_link_target(link)
        if suffix and not target:
            # "#section" / "?query" only: refers to the current document.
            continue

        full_path = os.path.normpath(os.path.join(doc_dir, target))
        if not full_path.endswith((".md", ".mdx")):
            full_path += ".md"

        # Handling rename situations
        moved = False
        for from_path, to_path in move_pairs:
            from_base, from_ext = os.path.splitext(from_path)
            to_base, to_ext = os.path.splitext(to_path)
            # Only the extension changed: extension-less links stay valid.
            if (from_ext in markdown_exts or to_ext in markdown_exts) and (from_base == to_base):
                continue
            if is_same_file(full_path, from_path):
                relative_to_path = os.path.relpath(to_path, doc_dir)
                relative_to_path = remove_suffix(relative_to_path, ".md")
                relative_to_path = remove_suffix(relative_to_path, ".mdx")
                new_link = relative_to_path + suffix
                print(f"🔄 {file_path}: Updated moved link {link} -> {new_link}")
                new_content = new_content.replace(f"({link})", f"({new_link})")
                change_detected = True
                moved = True
        if moved:
            # The old path no longer exists by definition; running the
            # broken-link repair would only print a second, misleading message.
            continue

        # Handling delete cases
        for deleted_path in deletes:
            if is_same_file(full_path, deleted_path):
                print(f"⚠️ {file_path}: Link to deleted file {link}")
                change_detected = True

        # Dealing with broken link repair
        if legacy_version or os.path.exists(full_path):
            continue

        # The current link is broken: look for a document with the same name.
        file_base = os.path.basename(target)
        file_base = remove_suffix(file_base, ".md")
        file_base = remove_suffix(file_base, ".mdx")

        found_path = find_nearest_file(file_base, doc_dir)
        if found_path:
            relative_to_path = os.path.relpath(found_path, doc_dir)
            relative_to_path = remove_suffix(relative_to_path, ".md")
            relative_to_path = remove_suffix(relative_to_path, ".mdx")
            new_link = relative_to_path + suffix
            print(f"🛠️ {file_path}: Fixed broken link {link} -> {new_link}")
            new_content = new_content.replace(f"({link})", f"({new_link})")
        else:
            print(f"❌ {file_path}: Could not fix broken link {link}")
        change_detected = True

    if new_content != content:
        with open(file_path, "w", encoding="utf-8") as f:
            f.write(new_content)
def extract_file_changes(git_show_output: List[AnyStr]):
    """
    Parse ``git show`` output and record the renamed and deleted files.

    The results are stored in the module-level ``move_pairs`` (list of
    ``(from_path, to_path)`` tuples) and ``deletes`` (list of paths).

    :param git_show_output: lines of ``git show`` output, each ending with its
        newline; both ``bytes`` and ``str`` lines are accepted
    """
    global move_pairs
    global deletes

    print("Parsing commit lines...")
    # The diff body may contain bytes that are not valid UTF-8 (binary or
    # legacy-encoded files); they are irrelevant here and must not abort parsing.
    content = "".join(
        chunk.decode("utf-8", errors="replace") if isinstance(chunk, bytes) else chunk
        for chunk in git_show_output
    )

    # No re.DOTALL: "." must stop at the end of the line. With DOTALL the
    # trailing "index .+" swallowed the rest of the diff, so only the first
    # deleted file was ever reported.
    move_pattern = r"rename from (.+?)\nrename to (.+?)\n"
    move_matches = re.findall(move_pattern, content)
    print(f"Moved files detected: {len(move_matches)}")

    delete_pattern = r"diff --git a/(\S+) b/\1\ndeleted file mode \d+\nindex .+"
    delete_matches = re.findall(delete_pattern, content)
    print(f"Deleted files detected: {len(delete_matches)}")

    move_pairs = move_matches
    deletes = delete_matches
def travel(root_path: str):
    """Run ``process_md_file`` on every ``.md``/``.mdx`` file found beneath ``root_path``."""
    markdown_suffixes = (".md", ".mdx")
    for current_dir, _subdirs, names in os.walk(root_path):
        for name in names:
            if not name.endswith(markdown_suffixes):
                continue
            process_md_file(os.path.join(current_dir, name))
# Script entry point: inspect one commit, then scan and repair links in all documentation directories.
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Fix moved/deleted/broken md links for a commit")
    parser.add_argument("commit_id", type=str, help="Git commit id to check")
    args = parser.parse_args()

    # Use an argument list instead of a shell string so the commit id can never
    # be interpreted as shell syntax. stderr is kept separate so git's
    # diagnostics are not parsed as diff content.
    git_show = subprocess.run(
        ["git", "show", args.commit_id],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    if git_show.returncode != 0:
        # Without the diff the rename/delete handling would silently do nothing.
        sys.stderr.write(git_show.stderr.decode("utf-8", errors="replace"))
        sys.stderr.write(f"git show {args.commit_id} failed with exit code {git_show.returncode}\n")
        sys.exit(2)

    extract_file_changes(git_show.stdout.splitlines(keepends=True))

    for search_dir in search_dirs:
        travel(search_dir)

    if change_detected:
        print("❗ Link issues detected and/or fixed.")
        sys.exit(1)
    else:
        print("✅ No issues detected.")