Skip to content

Commit 6362ff4

Browse files
feat(cluster): support system-managed cluster (#17051)
* feat(cluster): support custom management cluster * feat(cluster): support custom management cluster * feat(cluster): support custom management cluster * feat(cluster): support custom management cluster * feat(cluster): support custom management cluster * feat(cluster): support custom management cluster * feat(cluster): support custom management cluster * feat(cluster): support custom management cluster * feat(cluster): support custom management cluster * feat(cluster): support custom management cluster * feat(cluster): support custom management cluster * feat(cluster): support custom management cluster * feat(cluster): support custom management cluster * feat(cluster): support custom management cluster * feat(cluster): support custom management cluster * feat(cluster): support custom management cluster * feat(cluster): support custom management cluster * feat(cluster): support custom management cluster * feat(cluster): support custom management cluster * feat(cluster): support custom management cluster * feat(cluster): support custom management cluster * feat(cluster): support custom management cluster * feat(cluster): support custom management cluster * feat(cluster): support custom management cluster * feat(cluster): support custom management cluster * feat(cluster): support custom management cluster * feat(cluster): support custom management cluster * feat(cluster): support custom management cluster * feat(cluster): support custom management cluster * feat(cluster): support custom management cluster * feat(cluster): support custom management cluster * feat(cluster): support custom management cluster * feat(cluster): support custom management cluster * feat(cluster): support custom management cluster * refactor: improve and clean up `warehouse_mgr::upsert_self_managed()` ### Improvements: - Added retry mechanism with a fallback in the retry loop. - Return an error if an unexpected response is received when `TxnGetResponse` is expected. 
- Refined quit-retry condition: now only triggered when the seq of `NodeInfo` changes. ### Refactoring: - Simplified and decoupled nested branching for better readability and maintainability. - Consolidated related logic, e.g., building `txn if_then` operations in a single place. - Differentiated `NodeInfo` with and without warehouse-related information. ### Documentation: - Added details explaining behavioral differences between insert and update modes. * feat(cluster): clean code * feat(cluster): clean code * feat(cluster): clean code * feat(cluster): clean code * feat(cluster): clean code * feat(cluster): clean code * feat(cluster): clean code * feat(cluster): clean code * feat(cluster): clean code * feat(cluster): clean code * feat(cluster): clean code * feat(cluster): clean code * feat(cluster): clean code * feat(cluster): clean code * feat(cluster): add concurrent unit test * feat(cluster): add concurrent unit test * feat(cluster): clean code * feat(cluster): clean code --------- Co-authored-by: 张炎泼 <[email protected]>
1 parent 783d155 commit 6362ff4

File tree

83 files changed

+7228
-627
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

83 files changed

+7228
-627
lines changed

Diff for: Cargo.lock

+15
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Diff for: Cargo.toml

+2
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,7 @@ members = [
8585
"src/query/ee_features/storage_quota",
8686
"src/query/ee_features/inverted_index",
8787
"src/query/ee_features/virtual_column",
88+
"src/query/ee_features/resources_management",
8889
"src/query/service",
8990
"src/query/ee",
9091
"src/meta/api",
@@ -185,6 +186,7 @@ databend-enterprise-fail-safe = { path = "src/query/ee_features/fail_safe" }
185186
databend-enterprise-inverted-index = { path = "src/query/ee_features/inverted_index" }
186187
databend-enterprise-meta = { path = "src/meta/ee" }
187188
databend-enterprise-query = { path = "src/query/ee" }
189+
databend-enterprise-resources-management = { path = "src/query/ee_features/resources_management" }
188190
databend-enterprise-storage-encryption = { path = "src/query/ee_features/storage_encryption" }
189191
databend-enterprise-storage-quota = { path = "src/query/ee_features/storage_quota" }
190192
databend-enterprise-stream-handler = { path = "src/query/ee_features/stream_handler" }
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
# Usage:
2+
# databend-query -c databend_query_config_spec.toml
3+
4+
[query]
5+
max_active_sessions = 256
6+
shutdown_wait_timeout_ms = 5000
7+
8+
# For flight rpc.
9+
flight_api_address = "0.0.0.0:flight_port"
10+
11+
# Databend Query http address.
12+
# For admin REST API.
13+
admin_api_address = "0.0.0.0:admin_api_port"
14+
15+
# Databend Query metrics REST API.
16+
metric_api_address = "0.0.0.0:metric_api_port"
17+
18+
# Databend Query MySQL Handler.
19+
mysql_handler_host = "0.0.0.0"
20+
mysql_handler_port = mysql_port
21+
22+
# Databend Query ClickHouse Handler.
23+
clickhouse_http_handler_host = "0.0.0.0"
24+
clickhouse_http_handler_port = clickhouse_port
25+
26+
# Databend Query HTTP Handler.
27+
http_handler_host = "0.0.0.0"
28+
http_handler_port = http_port
29+
30+
# Databend Query FlightSQL Handler.
31+
flight_sql_handler_host = "0.0.0.0"
32+
flight_sql_handler_port = flight_sql_port
33+
34+
tenant_id = "test_tenant"
35+
36+
table_engine_memory_enabled = true
37+
default_storage_format = 'parquet'
38+
default_compression = 'zstd'
39+
40+
[[query.users]]
41+
name = "root"
42+
auth_type = "no_password"
43+
44+
[[query.users]]
45+
name = "default"
46+
auth_type = "no_password"
47+
48+
# This is for testing.
49+
[[query.udfs]]
50+
name = "ping"
51+
definition = "CREATE FUNCTION ping(STRING) RETURNS STRING LANGUAGE python HANDLER = 'ping' ADDRESS = 'http://0.0.0.0:8815'"
52+
53+
[query.resources_management]
54+
type = "system_managed"
55+
node_group
56+
57+
[log]
58+
59+
[log.file]
60+
level = "INFO"
61+
format = "text"
62+
dir = "./.databend/query_logs"
63+
prefix_filter = ""
64+
65+
[meta]
66+
# It is a list of `grpc_api_advertise_host:<grpc-api-port>` of databend-meta config
67+
endpoints = ["0.0.0.0:9191"]
68+
username = "root"
69+
password = "root"
70+
client_timeout_in_second = 60
71+
auto_sync_interval = 60
72+
73+
# Storage config.
74+
[storage]
75+
# fs | s3 | azblob | obs | oss
76+
type = "fs"
77+
78+
# Set a local folder to store your data.
79+
# Comment out this block if you're NOT using local file system as storage.
80+
[storage.fs]
81+
data_path = "./.databend/stateless_test_data"
82+
83+
# Cache config.
84+
[cache]
85+
# Type of storage to keep the table data cache
86+
#
87+
# available options: [none|disk]
88+
# default is "none", which disables the table data cache
89+
# use "disk" to enable the disk cache
90+
data_cache_storage = "none"
91+
92+
[cache.disk]
93+
# cache path
94+
path = "./.databend/_cache"
95+
# max bytes of cached data 20G
96+
max_bytes = 21474836480

Diff for: scripts/ci/deploy/databend-query-system-managed.sh

+135
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,135 @@
1+
#!/bin/bash
2+
# Copyright 2022 The Databend Authors.
3+
# SPDX-License-Identifier: Apache-2.0.
4+
5+
set -e
6+
7+
SCRIPT_PATH="$(cd "$(dirname "$0")" >/dev/null 2>&1 && pwd)"
8+
cd "$SCRIPT_PATH/../../.." || exit
9+
BUILD_PROFILE=${BUILD_PROFILE:-debug}
10+
11+
if [ $# -eq 1 ]; then
12+
num=$1
13+
node_group=""
14+
elif [ $# -eq 2 ]; then
15+
num=$1
16+
node_group=$2
17+
else
18+
echo "Usage: $0 <number> - Start number of databend-query with system-managed mode"
19+
exit 1
20+
fi
21+
22+
if ! [[ "$num" =~ ^[0-9]*$ ]]; then
23+
echo "Error: Argument must be an integer."
24+
exit 1
25+
fi
26+
27+
# Caveat: has to kill query first.
28+
# `query` tries to remove its liveness record from meta before shutting down.
29+
# If meta is stopped, `query` will receive an error that hangs graceful
30+
# shutdown.
31+
killall databend-query || true
32+
sleep 3
33+
34+
killall databend-meta || true
35+
sleep 3
36+
37+
for bin in databend-query databend-meta; do
38+
if test -n "$(pgrep $bin)"; then
39+
echo "The $bin is not killed. force killing."
40+
killall -9 $bin || true
41+
fi
42+
done
43+
44+
# Wait for the killed processes to clean up resources
45+
sleep 1
46+
47+
echo 'Start Meta service HA cluster(3 nodes)...'
48+
49+
mkdir -p ./.databend/
50+
51+
nohup ./target/${BUILD_PROFILE}/databend-meta -c scripts/ci/deploy/config/databend-meta-node-1.toml >./.databend/meta-1.out 2>&1 &
52+
python3 scripts/ci/wait_tcp.py --timeout 30 --port 9191
53+
54+
# wait for cluster formation to complete.
55+
sleep 1
56+
57+
nohup ./target/${BUILD_PROFILE}/databend-meta -c scripts/ci/deploy/config/databend-meta-node-2.toml >./.databend/meta-2.out 2>&1 &
58+
python3 scripts/ci/wait_tcp.py --timeout 30 --port 28202
59+
60+
# wait for cluster formation to complete.
61+
sleep 1
62+
63+
nohup ./target/${BUILD_PROFILE}/databend-meta -c scripts/ci/deploy/config/databend-meta-node-3.toml >./.databend/meta-3.out 2>&1 &
64+
python3 scripts/ci/wait_tcp.py --timeout 30 --port 28302
65+
66+
# wait for cluster formation to complete.
67+
sleep 1
68+
69+
find_available_port() {
70+
local base_port=20000
71+
local max_port=65535
72+
local attempts=10
73+
74+
for ((i=0; i<attempts; i++)); do
75+
port=$(( RANDOM % (max_port - base_port + 1) + base_port ))
76+
if ! lsof -i :$port >/dev/null 2>&1; then
77+
echo $port
78+
return
79+
fi
80+
done
81+
82+
echo "Unable to find an available port after $attempts attempts" >&2
83+
exit 1
84+
}
85+
86+
87+
start_databend_query() {
88+
local http_port=$1
89+
local mysql_port=$2
90+
local log_dir=$3
91+
local node_group=$4
92+
system_managed_config="./scripts/ci/deploy/config/databend-query-node-system-managed.toml"
93+
94+
temp_file=$(mktemp)
95+
96+
if [ -f "$system_managed_config" ]; then
97+
sed -e "s/flight_port/$(find_available_port)/g" \
98+
-e "s/admin_api_port/$(find_available_port)/g" \
99+
-e "s/metric_api_port/$(find_available_port)/g" \
100+
-e "s/mysql_port/${mysql_port}/g" \
101+
-e "s/clickhouse_port/$(find_available_port)/g" \
102+
-e "s/http_port/${http_port}/g" \
103+
-e "s/flight_sql_port/$(find_available_port)/g" \
104+
-e "s/query_logs/${log_dir}/g" \
105+
-e "s/node_group/node_group=\"${node_group}\"/g" \
106+
"$system_managed_config" > "$temp_file"
107+
108+
if [ $? -eq 0 ]; then
109+
echo "Start databend-query on port $http_port..."
110+
nohup target/${BUILD_PROFILE}/databend-query -c $temp_file --internal-enable-sandbox-tenant &
111+
112+
echo "Waiting on databend-query 10 seconds..."
113+
python3 scripts/ci/wait_tcp.py --timeout 30 --port $http_port
114+
else
115+
echo "Error occurred during port replacement."
116+
rm -f "$temp_file"
117+
exit 1
118+
fi
119+
else
120+
echo "Error: system-managed config file is not exists."
121+
exit 1
122+
fi
123+
}
124+
125+
if ! lsof -i :8000 >/dev/null 2>&1; then
126+
start_databend_query 8000 3307 "logs_1" $node_group
127+
num=$(( num - 1 ))
128+
fi
129+
130+
for (( i=0; i<$num; i++ ))
131+
do
132+
http_port=$(find_available_port)
133+
mysql_port=$(find_available_port)
134+
start_databend_query $http_port $mysql_port "logs_$http_port" $node_group
135+
done

Diff for: src/binaries/query/ee_main.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,7 @@ pub async fn main_entrypoint() -> Result<(), MainError> {
6969
return Ok(());
7070
}
7171

72-
init_services(&conf).await.with_context(make_error)?;
72+
init_services(&conf, true).await.with_context(make_error)?;
7373
EnterpriseServices::init(conf.clone())
7474
.await
7575
.with_context(make_error)?;

Diff for: src/binaries/query/entry.rs

+4-2
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,7 @@ pub async fn run_cmd(conf: &InnerConfig) -> Result<bool, MainError> {
6969
Ok(true)
7070
}
7171

72-
pub async fn init_services(conf: &InnerConfig) -> Result<(), MainError> {
72+
pub async fn init_services(conf: &InnerConfig, ee_mode: bool) -> Result<(), MainError> {
7373
let make_error = || "failed to init services";
7474

7575
let binary_version = DATABEND_COMMIT_VERSION.clone();
@@ -93,7 +93,9 @@ pub async fn init_services(conf: &InnerConfig) -> Result<(), MainError> {
9393
.with_context(make_error);
9494
}
9595
// Make sure global services have been inited.
96-
GlobalServices::init(conf).await.with_context(make_error)
96+
GlobalServices::init(conf, ee_mode)
97+
.await
98+
.with_context(make_error)
9799
}
98100

99101
async fn precheck_services(conf: &InnerConfig) -> Result<(), MainError> {

Diff for: src/binaries/query/oss_main.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ async fn main_entrypoint() -> Result<(), MainError> {
7070
return Ok(());
7171
}
7272

73-
init_services(&conf).await?;
73+
init_services(&conf, false).await?;
7474
// init oss license manager
7575
OssLicenseManager::init(conf.query.tenant_id.tenant_name().to_string())
7676
.with_context(make_error)?;

Diff for: src/binaries/tool/table_meta_inspector.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,7 @@ async fn parse_input_data(config: &InspectorConfig) -> Result<Vec<u8>> {
8888
builder = builder.collect(from_file(Toml, config_file));
8989
let read_config = builder.build()?;
9090
let inner_config: InnerConfig = read_config.clone().try_into()?;
91-
GlobalServices::init(&inner_config).await?;
91+
GlobalServices::init(&inner_config, false).await?;
9292
let storage_config: StorageConfig = read_config.storage.try_into()?;
9393
init_operator(&storage_config.params)?
9494
}

Diff for: src/common/base/src/headers.rs

+1
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ pub const HEADER_QUERY_STATE: &str = "X-DATABEND-QUERY-STATE";
2525
pub const HEADER_QUERY_PAGE_ROWS: &str = "X-DATABEND-QUERY-PAGE-ROWS";
2626
pub const HEADER_VERSION: &str = "X-DATABEND-VERSION";
2727
pub const HEADER_STICKY: &str = "X-DATABEND-STICKY-NODE";
28+
pub const HEADER_WAREHOUSE: &str = "X-DATABEND-WAREHOUSE";
2829

2930
pub const HEADER_SIGNATURE: &str = "X-DATABEND-SIGNATURE";
3031
pub const HEADER_AUTH_METHOD: &str = "X-DATABEND-AUTH-METHOD";

Diff for: src/common/exception/src/exception_code.rs

+7
Original file line numberDiff line numberDiff line change
@@ -298,6 +298,13 @@ build_exceptions! {
298298
ClusterUnknownNode(2401),
299299
ClusterNodeAlreadyExists(2402),
300300
InvalidWarehouse(2403),
301+
NoResourcesAvailable(2404),
302+
WarehouseAlreadyExists(2405),
303+
UnknownWarehouse(2406),
304+
WarehouseOperateConflict(2407),
305+
EmptyNodesForWarehouse(2408),
306+
WarehouseClusterAlreadyExists(2409),
307+
WarehouseClusterNotExists(2410),
301308

302309
// Stage error codes.
303310
UnknownStage(2501),

Diff for: src/common/license/src/license.rs

+3
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,8 @@ pub enum Feature {
7373
StorageQuota(StorageQuota),
7474
#[serde(alias = "amend_table", alias = "AMEND_TABLE")]
7575
AmendTable,
76+
#[serde(alias = "system_management", alias = "SYSTEM_MANAGEMENT")]
77+
SystemManagement,
7678
#[serde(other)]
7779
Unknown,
7880
}
@@ -119,6 +121,7 @@ impl fmt::Display for Feature {
119121
write!(f, ")")
120122
}
121123
Feature::AmendTable => write!(f, "amend_table"),
124+
Feature::SystemManagement => write!(f, "system_management"),
122125
Feature::Unknown => write!(f, "unknown"),
123126
}
124127
}

0 commit comments

Comments
 (0)