Skip to content

Commit dc9f058

Browse files
authored
Merge pull request #119 from Burning1020/fix-loop-ttrpc
sandbox: bugfix infinite loop in new_ttrpc_client()
2 parents 0ecff42 + 1ed7212 commit dc9f058

File tree

2 files changed

+64
-39
lines changed

2 files changed

+64
-39
lines changed

vmm/sandbox/src/client.rs

+43-25
Original file line numberDiff line numberDiff line change
@@ -15,11 +15,7 @@ limitations under the License.
1515
*/
1616

1717
use std::{
18-
io::{BufRead, BufReader, Write},
19-
os::unix::{
20-
io::{IntoRawFd, RawFd},
21-
net::UnixStream,
22-
},
18+
os::unix::io::{IntoRawFd, RawFd},
2319
time::Duration,
2420
};
2521

@@ -34,23 +30,28 @@ use nix::{
3430
time::{clock_gettime, ClockId},
3531
unistd::close,
3632
};
37-
use tokio::time::timeout;
33+
use tokio::{
34+
io::{AsyncBufReadExt, AsyncWriteExt, BufReader},
35+
net::UnixStream,
36+
time::timeout,
37+
};
3838
use ttrpc::{context::with_timeout, r#async::Client};
3939
use vmm_common::api::{sandbox::*, sandbox_ttrpc::SandboxServiceClient};
4040

4141
use crate::network::{NetworkInterface, Route};
4242

43+
const HVSOCK_RETRY_TIMEOUT_IN_MS: u64 = 10;
44+
// TODO: reduce to 10s
45+
const NEW_TTRPC_CLIENT_TIMEOUT: u64 = 45;
4346
const TIME_SYNC_PERIOD: u64 = 60;
4447
const TIME_DIFF_TOLERANCE_IN_MS: u64 = 10;
4548

4649
pub(crate) async fn new_sandbox_client(address: &str) -> Result<SandboxServiceClient> {
47-
let client = new_ttrpc_client(address).await?;
50+
let client = new_ttrpc_client_with_timeout(address, NEW_TTRPC_CLIENT_TIMEOUT).await?;
4851
Ok(SandboxServiceClient::new(client))
4952
}
5053

51-
async fn new_ttrpc_client(address: &str) -> Result<Client> {
52-
let ctx_timeout = 10;
53-
54+
async fn new_ttrpc_client_with_timeout(address: &str, t: u64) -> Result<Client> {
5455
let mut last_err = Error::Other(anyhow!(""));
5556

5657
let fut = async {
@@ -62,16 +63,17 @@ async fn new_ttrpc_client(address: &str) -> Result<Client> {
6263
}
6364
Err(e) => last_err = e,
6465
}
66+
// In case that the address doesn't exist, the executed function in this loop are all
67+
// sync, making the first time of future poll in timeout hang forever. As a result, the
68+
// timeout will hang too. To solve this, add a async function in this loop or call
69+
// `tokio::task::yield_now()` to give up current cpu time slice.
70+
tokio::time::sleep(Duration::from_millis(10)).await;
6571
}
6672
};
6773

68-
let client = timeout(Duration::from_secs(ctx_timeout), fut)
74+
let client = timeout(Duration::from_secs(t), fut)
6975
.await
70-
.map_err(|_| {
71-
let e = anyhow!("{}s timeout connecting socket: {}", ctx_timeout, last_err);
72-
error!("{}", e);
73-
e
74-
})?;
76+
.map_err(|_| anyhow!("{}s timeout connecting socket: {}", t, last_err))?;
7577
Ok(client)
7678
}
7779

@@ -165,28 +167,31 @@ async fn connect_to_hvsocket(address: &str) -> Result<RawFd> {
165167
if v.len() < 2 {
166168
return Err(anyhow!("hvsock address {} should not less than 2", address).into());
167169
}
168-
(v[0].to_string(), v[1].to_string())
170+
(v[0], v[1])
169171
};
170172

171-
tokio::task::spawn_blocking(move || {
172-
let mut stream =
173-
UnixStream::connect(&addr).map_err(|e| anyhow!("failed to connect hvsock: {}", e))?;
173+
let fut = async {
174+
let mut stream = UnixStream::connect(addr).await?;
174175
stream
175176
.write_all(format!("CONNECT {}\n", port).as_bytes())
177+
.await
176178
.map_err(|e| anyhow!("hvsock connected but failed to write CONNECT: {}", e))?;
177179

178180
let mut response = String::new();
179-
BufReader::new(&stream)
181+
BufReader::new(&mut stream)
180182
.read_line(&mut response)
183+
.await
181184
.map_err(|e| anyhow!("CONNECT sent but failed to get response: {}", e))?;
182185
if response.starts_with("OK") {
183-
Ok(stream.into_raw_fd())
186+
Ok(stream.into_std()?.into_raw_fd())
184187
} else {
185188
Err(anyhow!("CONNECT sent but response is not OK: {}", response).into())
186189
}
187-
})
188-
.await
189-
.map_err(|e| anyhow!("failed to spawn blocking task: {}", e))?
190+
};
191+
192+
timeout(Duration::from_millis(HVSOCK_RETRY_TIMEOUT_IN_MS), fut)
193+
.await
194+
.map_err(|_| anyhow!("hvsock retry {}ms timeout", HVSOCK_RETRY_TIMEOUT_IN_MS))?
190195
}
191196

192197
pub fn unix_sock(r#abstract: bool, socket_path: &str) -> Result<UnixAddr> {
@@ -302,3 +307,16 @@ pub(crate) async fn client_sync_clock(client: &SandboxServiceClient, id: &str) {
302307
}
303308
});
304309
}
310+
311+
#[cfg(test)]
312+
mod tests {
313+
use crate::client::new_ttrpc_client_with_timeout;
314+
315+
#[tokio::test]
316+
async fn test_new_ttrpc_client_timeout() {
317+
// Expect new_ttrpc_client would return timeout error, instead of blocking.
318+
assert!(new_ttrpc_client_with_timeout("hvsock://fake.sock:1024", 1)
319+
.await
320+
.is_err());
321+
}
322+
}

vmm/sandbox/src/cloud_hypervisor/client.rs

+21-14
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,8 @@ use std::{
2323
use anyhow::anyhow;
2424
use api_client::{simple_api_command, simple_api_full_command_with_fds_and_response};
2525
use containerd_sandbox::error::Result;
26-
use log::error;
26+
use log::{debug, error, trace};
27+
use tokio::task::spawn_blocking;
2728

2829
use crate::{
2930
cloud_hypervisor::devices::{block::DiskConfig, AddDeviceResponse, RemoveDeviceRequest},
@@ -38,25 +39,31 @@ pub struct ChClient {
3839

3940
impl ChClient {
4041
pub async fn new(socket_path: String) -> Result<Self> {
42+
let s = socket_path.to_string();
4143
let start_time = SystemTime::now();
42-
tokio::task::spawn_blocking(move || loop {
43-
match UnixStream::connect(&socket_path) {
44-
Ok(socket) => {
45-
return Ok(Self { socket });
46-
}
47-
Err(e) => {
48-
if start_time.elapsed().unwrap().as_secs()
49-
> CLOUD_HYPERVISOR_START_TIMEOUT_IN_SEC
50-
{
51-
error!("failed to connect api server: {:?}", e);
52-
return Err(anyhow!("timeout connect client, {}", e).into());
44+
let socket = spawn_blocking(move || -> Result<UnixStream> {
45+
loop {
46+
match UnixStream::connect(&socket_path) {
47+
Ok(socket) => {
48+
return Ok(socket);
49+
}
50+
Err(e) => {
51+
trace!("failed to create client: {:?}", e);
52+
if start_time.elapsed().unwrap().as_secs()
53+
> CLOUD_HYPERVISOR_START_TIMEOUT_IN_SEC
54+
{
55+
error!("failed to create client: {:?}", e);
56+
return Err(anyhow!("timeout connect client, {}", e).into());
57+
}
58+
sleep(Duration::from_millis(10));
5359
}
54-
sleep(Duration::from_millis(10));
5560
}
5661
}
5762
})
5863
.await
59-
.map_err(|e| anyhow!("failed to spawn a task {}", e))?
64+
.map_err(|e| anyhow!("failed to join thread {}", e))??;
65+
debug!("connected to api server {}", s);
66+
Ok(Self { socket })
6067
}
6168

6269
pub fn hot_attach(&mut self, device_info: DeviceInfo) -> Result<String> {

0 commit comments

Comments
 (0)