Skip to content

Commit ed97884

Browse files
committed
sandbox: bugfix infinite loop in new_ttrpc_client()
Signed-off-by: Zhang Tianyang <[email protected]>
1 parent 1c73f6e commit ed97884

File tree

3 files changed

+64
-49
lines changed

3 files changed

+64
-49
lines changed

vmm/sandbox/src/client.rs

+39-31
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ use nix::{
3131
unistd::close,
3232
};
3333
use tokio::{
34-
io::{AsyncReadExt, AsyncWriteExt},
34+
io::{AsyncBufReadExt, AsyncWriteExt, BufReader},
3535
net::UnixStream,
3636
time::timeout,
3737
};
@@ -41,17 +41,17 @@ use vmm_common::api::{sandbox::*, sandbox_ttrpc::SandboxServiceClient};
4141
use crate::network::{NetworkInterface, Route};
4242

4343
const HVSOCK_RETRY_TIMEOUT_IN_MS: u64 = 10;
44+
// TODO: reduce to 10s
45+
const NEW_TTRPC_CLIENT_TIMEOUT: u64 = 45;
4446
const TIME_SYNC_PERIOD: u64 = 60;
4547
const TIME_DIFF_TOLERANCE_IN_MS: u64 = 10;
4648

4749
pub(crate) async fn new_sandbox_client(address: &str) -> Result<SandboxServiceClient> {
48-
let client = new_ttrpc_client(address).await?;
50+
let client = new_ttrpc_client_with_timeout(address, NEW_TTRPC_CLIENT_TIMEOUT).await?;
4951
Ok(SandboxServiceClient::new(client))
5052
}
5153

52-
async fn new_ttrpc_client(address: &str) -> Result<Client> {
53-
let ctx_timeout = 10;
54-
54+
async fn new_ttrpc_client_with_timeout(address: &str, t: u64) -> Result<Client> {
5555
let mut last_err = Error::Other(anyhow!(""));
5656

5757
let fut = async {
@@ -63,12 +63,17 @@ async fn new_ttrpc_client(address: &str) -> Result<Client> {
6363
}
6464
Err(e) => last_err = e,
6565
}
66+
// In case that the address doesn't exist, the executed function in this loop are all
67+
// sync, making the first time of future poll in timeout hang forever. As a result, the
68+
// timeout will hang too. To solve this, add a async function in this loop or call
69+
// `tokio::task::yield_now()` to give up current cpu time slice.
70+
tokio::time::sleep(Duration::from_millis(10)).await;
6671
}
6772
};
6873

69-
let client = timeout(Duration::from_secs(ctx_timeout), fut)
74+
let client = timeout(Duration::from_secs(t), fut)
7075
.await
71-
.map_err(|_| anyhow!("{}s timeout connecting socket: {}", ctx_timeout, last_err))?;
76+
.map_err(|_| anyhow!("{}s timeout connecting socket: {}", t, last_err))?;
7277
Ok(client)
7378
}
7479

@@ -167,31 +172,21 @@ async fn connect_to_hvsocket(address: &str) -> Result<RawFd> {
167172

168173
let fut = async {
169174
let mut stream = UnixStream::connect(addr).await?;
170-
171-
match stream.write(format!("CONNECT {}\n", port).as_bytes()).await {
172-
Ok(_) => {
173-
let mut buf = [0; 4096];
174-
match stream.read(&mut buf).await {
175-
Ok(0) => Err(anyhow!("stream closed")),
176-
Ok(n) => {
177-
if String::from_utf8(buf[..n].to_vec())
178-
.unwrap_or_default()
179-
.contains("OK")
180-
{
181-
return Ok(stream.into_std()?.into_raw_fd());
182-
}
183-
Err(anyhow!("failed to connect"))
184-
}
185-
Err(ref e) if e.kind() == std::io::ErrorKind::WouldBlock => {
186-
Err(anyhow!("{}", e))
187-
}
188-
Err(e) => Err(anyhow!("failed to read from hvsock: {}", e)),
189-
}
190-
}
191-
Err(ref e) if e.kind() == std::io::ErrorKind::WouldBlock => Err(anyhow!("{}", e)),
192-
Err(e) => Err(anyhow!("failed to write CONNECT to hvsock: {}", e)),
175+
stream
176+
.write_all(format!("CONNECT {}\n", port).as_bytes())
177+
.await
178+
.map_err(|e| anyhow!("hvsock connected but failed to write CONNECT: {}", e))?;
179+
180+
let mut response = String::new();
181+
BufReader::new(&mut stream)
182+
.read_line(&mut response)
183+
.await
184+
.map_err(|e| anyhow!("CONNECT sent but failed to get response: {}", e))?;
185+
if response.starts_with("OK") {
186+
Ok(stream.into_std()?.into_raw_fd())
187+
} else {
188+
Err(anyhow!("CONNECT sent but response is not OK: {}", response).into())
193189
}
194-
.map_err(Error::Other)
195190
};
196191

197192
timeout(Duration::from_millis(HVSOCK_RETRY_TIMEOUT_IN_MS), fut)
@@ -312,3 +307,16 @@ pub(crate) async fn client_sync_clock(client: &SandboxServiceClient, id: &str) {
312307
}
313308
});
314309
}
310+
311+
#[cfg(test)]
312+
mod tests {
313+
use crate::client::new_ttrpc_client_with_timeout;
314+
315+
#[tokio::test]
316+
async fn test_new_ttrpc_client_timeout() {
317+
// Expect new_ttrpc_client would return timeout error, instead of blocking.
318+
assert!(new_ttrpc_client_with_timeout("hvsock://fake.sock:1024", 1)
319+
.await
320+
.is_err());
321+
}
322+
}

vmm/sandbox/src/cloud_hypervisor/client.rs

+24-17
Original file line numberDiff line numberDiff line change
@@ -16,15 +16,15 @@ limitations under the License.
1616

1717
use std::{
1818
os::unix::net::UnixStream,
19-
path::Path,
2019
thread::sleep,
2120
time::{Duration, SystemTime},
2221
};
2322

2423
use anyhow::anyhow;
2524
use api_client::{simple_api_command, simple_api_full_command_with_fds_and_response};
2625
use containerd_sandbox::error::Result;
27-
use log::{error, trace};
26+
use log::{debug, error, trace};
27+
use tokio::task::spawn_blocking;
2828

2929
use crate::{
3030
cloud_hypervisor::devices::{block::DiskConfig, AddDeviceResponse, RemoveDeviceRequest},
@@ -38,25 +38,32 @@ pub struct ChClient {
3838
}
3939

4040
impl ChClient {
41-
pub fn new<P: AsRef<Path>>(socket_path: P) -> Result<Self> {
41+
pub async fn new(socket_path: String) -> Result<Self> {
42+
let s = socket_path.to_string();
4243
let start_time = SystemTime::now();
43-
loop {
44-
match UnixStream::connect(&socket_path) {
45-
Ok(socket) => {
46-
return Ok(Self { socket });
47-
}
48-
Err(e) => {
49-
trace!("failed to create client: {:?}", e);
50-
if start_time.elapsed().unwrap().as_secs()
51-
> CLOUD_HYPERVISOR_START_TIMEOUT_IN_SEC
52-
{
53-
error!("failed to create client: {:?}", e);
54-
return Err(anyhow!("timeout connect client, {}", e).into());
44+
let socket = spawn_blocking(move || -> Result<UnixStream> {
45+
loop {
46+
match UnixStream::connect(&socket_path) {
47+
Ok(socket) => {
48+
return Ok(socket);
49+
}
50+
Err(e) => {
51+
trace!("failed to create client: {:?}", e);
52+
if start_time.elapsed().unwrap().as_secs()
53+
> CLOUD_HYPERVISOR_START_TIMEOUT_IN_SEC
54+
{
55+
error!("failed to create client: {:?}", e);
56+
return Err(anyhow!("timeout connect client, {}", e).into());
57+
}
58+
sleep(Duration::from_millis(10));
5559
}
56-
sleep(Duration::from_millis(10));
5760
}
5861
}
59-
}
62+
})
63+
.await
64+
.map_err(|e| anyhow!("failed to join thread {}", e))??;
65+
debug!("connected to api server {}", s);
66+
Ok(Self { socket })
6067
}
6168

6269
pub fn hot_attach(&mut self, device_info: DeviceInfo) -> Result<String> {

vmm/sandbox/src/cloud_hypervisor/mod.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -113,7 +113,7 @@ impl CloudHypervisorVM {
113113
}
114114

115115
async fn create_client(&self) -> Result<ChClient> {
116-
ChClient::new(&self.config.api_socket)
116+
ChClient::new(self.config.api_socket.to_string()).await
117117
}
118118

119119
fn get_client(&mut self) -> Result<&mut ChClient> {

0 commit comments

Comments
 (0)