forked from wrenn/wrenn
feat(envd): update guest agent for Cloud Hypervisor
Remove Firecracker-specific MMDS metadata fetching and metrics host module. CH communicates with the guest purely over TAP networking, so MMDS (Firecracker's metadata service via MMDS address) is no longer needed. - Remove src/host/ module (mmds.rs, metrics.rs) - Remove reqwest dependency (was only used for MMDS HTTP calls) - Remove --isnotfc CLI flag (no longer dual-mode) - Simplify health endpoint and init handler - Update state management for CH snapshot lifecycle - Bump version to 0.3.0
This commit is contained in:
@ -9,8 +9,3 @@ pub const WRENN_RUN_DIR: &str = "/run/wrenn";
|
||||
|
||||
pub const KILOBYTE: u64 = 1024;
|
||||
pub const MEGABYTE: u64 = 1024 * KILOBYTE;
|
||||
|
||||
pub const MMDS_ADDRESS: &str = "169.254.169.254";
|
||||
pub const MMDS_POLL_INTERVAL: Duration = Duration::from_millis(50);
|
||||
pub const MMDS_TOKEN_EXPIRATION_SECS: u64 = 60;
|
||||
pub const MMDS_ACCESS_TOKEN_CLIENT_TIMEOUT: Duration = Duration::from_secs(10);
|
||||
|
||||
@ -1,73 +0,0 @@
|
||||
use std::ffi::CString;
|
||||
use std::time::{SystemTime, UNIX_EPOCH};
|
||||
|
||||
use serde::Serialize;
|
||||
|
||||
#[derive(Serialize)]
|
||||
pub struct Metrics {
|
||||
pub ts: i64,
|
||||
pub cpu_count: u32,
|
||||
pub cpu_used_pct: f32,
|
||||
pub mem_total_mib: u64,
|
||||
pub mem_used_mib: u64,
|
||||
pub mem_total: u64,
|
||||
pub mem_used: u64,
|
||||
pub disk_used: u64,
|
||||
pub disk_total: u64,
|
||||
}
|
||||
|
||||
pub fn get_metrics() -> Result<Metrics, String> {
|
||||
use sysinfo::System;
|
||||
|
||||
let mut sys = System::new();
|
||||
sys.refresh_memory();
|
||||
sys.refresh_cpu_all();
|
||||
|
||||
std::thread::sleep(std::time::Duration::from_millis(100));
|
||||
sys.refresh_cpu_all();
|
||||
|
||||
let cpu_count = sys.cpus().len() as u32;
|
||||
let cpu_used_pct = sys.global_cpu_usage();
|
||||
let cpu_used_pct_rounded = if cpu_used_pct > 0.0 {
|
||||
(cpu_used_pct * 100.0).round() / 100.0
|
||||
} else {
|
||||
0.0
|
||||
};
|
||||
|
||||
let mem_total = sys.total_memory();
|
||||
let mem_used = sys.used_memory();
|
||||
|
||||
let (disk_total, disk_used) = disk_stats("/")?;
|
||||
|
||||
let ts = SystemTime::now()
|
||||
.duration_since(UNIX_EPOCH)
|
||||
.unwrap()
|
||||
.as_secs() as i64;
|
||||
|
||||
Ok(Metrics {
|
||||
ts,
|
||||
cpu_count,
|
||||
cpu_used_pct: cpu_used_pct_rounded,
|
||||
mem_total_mib: mem_total / 1024 / 1024,
|
||||
mem_used_mib: mem_used / 1024 / 1024,
|
||||
mem_total,
|
||||
mem_used,
|
||||
disk_used,
|
||||
disk_total,
|
||||
})
|
||||
}
|
||||
|
||||
fn disk_stats(path: &str) -> Result<(u64, u64), String> {
|
||||
let c_path = CString::new(path).unwrap();
|
||||
let mut stat: libc::statfs = unsafe { std::mem::zeroed() };
|
||||
let ret = unsafe { libc::statfs(c_path.as_ptr(), &mut stat) };
|
||||
if ret != 0 {
|
||||
return Err(format!("statfs failed: {}", std::io::Error::last_os_error()));
|
||||
}
|
||||
|
||||
let block = stat.f_bsize as u64;
|
||||
let total = stat.f_blocks * block;
|
||||
let available = stat.f_bavail * block;
|
||||
|
||||
Ok((total, total - available))
|
||||
}
|
||||
@ -1,120 +0,0 @@
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
|
||||
use dashmap::DashMap;
|
||||
use serde::Deserialize;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
|
||||
use crate::config::{MMDS_ADDRESS, MMDS_POLL_INTERVAL, MMDS_TOKEN_EXPIRATION_SECS, WRENN_RUN_DIR};
|
||||
|
||||
#[derive(Debug, Clone, Deserialize)]
|
||||
pub struct MMDSOpts {
|
||||
#[serde(rename = "instanceID")]
|
||||
pub sandbox_id: String,
|
||||
#[serde(rename = "envID")]
|
||||
pub template_id: String,
|
||||
#[serde(rename = "address", default)]
|
||||
pub logs_collector_address: String,
|
||||
#[serde(rename = "accessTokenHash", default)]
|
||||
pub access_token_hash: String,
|
||||
}
|
||||
|
||||
async fn get_mmds_token(client: &reqwest::Client) -> Result<String, String> {
|
||||
let resp = client
|
||||
.put(format!("http://{MMDS_ADDRESS}/latest/api/token"))
|
||||
.header(
|
||||
"X-metadata-token-ttl-seconds",
|
||||
MMDS_TOKEN_EXPIRATION_SECS.to_string(),
|
||||
)
|
||||
.send()
|
||||
.await
|
||||
.map_err(|e| format!("mmds token request failed: {e}"))?;
|
||||
|
||||
let token = resp.text().await.map_err(|e| format!("mmds token read: {e}"))?;
|
||||
if token.is_empty() {
|
||||
return Err("mmds token is an empty string".into());
|
||||
}
|
||||
Ok(token)
|
||||
}
|
||||
|
||||
async fn get_mmds_opts(client: &reqwest::Client, token: &str) -> Result<MMDSOpts, String> {
|
||||
let resp = client
|
||||
.get(format!("http://{MMDS_ADDRESS}"))
|
||||
.header("X-metadata-token", token)
|
||||
.header("Accept", "application/json")
|
||||
.send()
|
||||
.await
|
||||
.map_err(|e| format!("mmds opts request failed: {e}"))?;
|
||||
|
||||
resp.json::<MMDSOpts>()
|
||||
.await
|
||||
.map_err(|e| format!("mmds opts parse: {e}"))
|
||||
}
|
||||
|
||||
pub async fn get_access_token_hash() -> Result<String, String> {
|
||||
let client = reqwest::Client::builder()
|
||||
.timeout(Duration::from_secs(10))
|
||||
.no_proxy()
|
||||
.build()
|
||||
.map_err(|e| format!("http client: {e}"))?;
|
||||
|
||||
let token = get_mmds_token(&client).await?;
|
||||
let opts = get_mmds_opts(&client, &token).await?;
|
||||
Ok(opts.access_token_hash)
|
||||
}
|
||||
|
||||
/// Polls MMDS every 50ms until metadata is available.
|
||||
/// Stores sandbox_id and template_id in env_vars and writes to /run/wrenn/ files.
|
||||
pub async fn poll_for_opts(
|
||||
env_vars: Arc<DashMap<String, String>>,
|
||||
cancel: CancellationToken,
|
||||
) -> Option<MMDSOpts> {
|
||||
let client = reqwest::Client::builder()
|
||||
.no_proxy()
|
||||
.build()
|
||||
.ok()?;
|
||||
|
||||
let mut interval = tokio::time::interval(MMDS_POLL_INTERVAL);
|
||||
|
||||
loop {
|
||||
tokio::select! {
|
||||
_ = cancel.cancelled() => {
|
||||
tracing::warn!("context cancelled while waiting for mmds opts");
|
||||
return None;
|
||||
}
|
||||
_ = interval.tick() => {
|
||||
let token = match get_mmds_token(&client).await {
|
||||
Ok(t) => t,
|
||||
Err(e) => {
|
||||
tracing::debug!(error = %e, "mmds token poll");
|
||||
continue;
|
||||
}
|
||||
};
|
||||
|
||||
let opts = match get_mmds_opts(&client, &token).await {
|
||||
Ok(o) => o,
|
||||
Err(e) => {
|
||||
tracing::debug!(error = %e, "mmds opts poll");
|
||||
continue;
|
||||
}
|
||||
};
|
||||
|
||||
env_vars.insert("WRENN_SANDBOX_ID".into(), opts.sandbox_id.clone());
|
||||
env_vars.insert("WRENN_TEMPLATE_ID".into(), opts.template_id.clone());
|
||||
|
||||
let run_dir = std::path::Path::new(WRENN_RUN_DIR);
|
||||
if let Err(e) = std::fs::create_dir_all(run_dir) {
|
||||
tracing::error!(error = %e, "mmds: failed to create run dir");
|
||||
}
|
||||
if let Err(e) = std::fs::write(run_dir.join(".WRENN_SANDBOX_ID"), &opts.sandbox_id) {
|
||||
tracing::error!(error = %e, "mmds: failed to write .WRENN_SANDBOX_ID");
|
||||
}
|
||||
if let Err(e) = std::fs::write(run_dir.join(".WRENN_TEMPLATE_ID"), &opts.template_id) {
|
||||
tracing::error!(error = %e, "mmds: failed to write .WRENN_TEMPLATE_ID");
|
||||
}
|
||||
|
||||
return Some(opts);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -1,2 +0,0 @@
|
||||
pub mod metrics;
|
||||
pub mod mmds;
|
||||
@ -1,5 +1,4 @@
|
||||
use std::sync::Arc;
|
||||
use std::sync::atomic::Ordering;
|
||||
|
||||
use axum::Json;
|
||||
use axum::extract::State;
|
||||
@ -10,13 +9,7 @@ use serde_json::json;
|
||||
use crate::state::AppState;
|
||||
|
||||
pub async fn get_health(State(state): State<Arc<AppState>>) -> impl IntoResponse {
|
||||
if state
|
||||
.needs_restore
|
||||
.compare_exchange(true, false, Ordering::AcqRel, Ordering::Relaxed)
|
||||
.is_ok()
|
||||
{
|
||||
post_restore_recovery(&state);
|
||||
}
|
||||
state.try_restore_recovery();
|
||||
|
||||
tracing::trace!("health check");
|
||||
|
||||
@ -25,17 +18,3 @@ pub async fn get_health(State(state): State<Arc<AppState>>) -> impl IntoResponse
|
||||
Json(json!({ "version": state.version })),
|
||||
)
|
||||
}
|
||||
|
||||
fn post_restore_recovery(state: &AppState) {
|
||||
tracing::info!("restore: post-restore recovery (no GC needed in Rust)");
|
||||
|
||||
state.snapshot_in_progress.store(false, std::sync::atomic::Ordering::Release);
|
||||
|
||||
state.conn_tracker.restore_after_snapshot();
|
||||
tracing::info!("restore: zombie connections closed");
|
||||
|
||||
if let Some(ref ps) = state.port_subsystem {
|
||||
ps.restart();
|
||||
tracing::info!("restore: port subsystem restarted");
|
||||
}
|
||||
}
|
||||
|
||||
@ -1,6 +1,5 @@
|
||||
use std::collections::HashMap;
|
||||
use std::sync::Arc;
|
||||
use std::sync::atomic::Ordering;
|
||||
|
||||
use axum::Json;
|
||||
use axum::extract::State;
|
||||
@ -8,20 +7,25 @@ use axum::http::{StatusCode, header};
|
||||
use axum::response::IntoResponse;
|
||||
use serde::Deserialize;
|
||||
|
||||
use crate::crypto;
|
||||
use crate::host::mmds;
|
||||
use crate::state::AppState;
|
||||
|
||||
#[derive(Deserialize, Default)]
|
||||
#[serde(rename_all = "camelCase")]
|
||||
pub struct InitRequest {
|
||||
#[serde(rename = "access_token")]
|
||||
pub access_token: Option<String>,
|
||||
#[serde(rename = "defaultUser")]
|
||||
pub default_user: Option<String>,
|
||||
#[serde(rename = "defaultWorkdir")]
|
||||
pub default_workdir: Option<String>,
|
||||
#[serde(rename = "envVars")]
|
||||
pub env_vars: Option<HashMap<String, String>>,
|
||||
#[serde(rename = "hyperloop_ip")]
|
||||
pub hyperloop_ip: Option<String>,
|
||||
pub timestamp: Option<String>,
|
||||
#[serde(rename = "volume_mounts")]
|
||||
pub volume_mounts: Option<Vec<VolumeMount>>,
|
||||
pub sandbox_id: Option<String>,
|
||||
pub template_id: Option<String>,
|
||||
}
|
||||
|
||||
#[derive(Deserialize)]
|
||||
@ -110,37 +114,27 @@ pub async fn post_init(
|
||||
}
|
||||
}
|
||||
|
||||
// Re-poll MMDS in background
|
||||
if state.is_fc {
|
||||
let env_vars = Arc::clone(&state.defaults.env_vars);
|
||||
let cancel = tokio_util::sync::CancellationToken::new();
|
||||
let cancel_clone = cancel.clone();
|
||||
tokio::spawn(async move {
|
||||
tokio::time::timeout(std::time::Duration::from_secs(60), async {
|
||||
mmds::poll_for_opts(env_vars, cancel_clone).await;
|
||||
})
|
||||
.await
|
||||
.ok();
|
||||
});
|
||||
// Set sandbox/template metadata from request body.
|
||||
if let Some(ref id) = init_req.sandbox_id {
|
||||
tracing::debug!(sandbox_id = %id, "setting sandbox ID from init request");
|
||||
// SAFETY: envd is single-threaded at init time; no concurrent env reads.
|
||||
unsafe { std::env::set_var("WRENN_SANDBOX_ID", id) };
|
||||
write_run_file(".WRENN_SANDBOX_ID", id);
|
||||
state.defaults.env_vars.insert("WRENN_SANDBOX_ID".into(), id.clone());
|
||||
}
|
||||
if let Some(ref id) = init_req.template_id {
|
||||
tracing::debug!(template_id = %id, "setting template ID from init request");
|
||||
// SAFETY: envd is single-threaded at init time; no concurrent env reads.
|
||||
unsafe { std::env::set_var("WRENN_TEMPLATE_ID", id) };
|
||||
write_run_file(".WRENN_TEMPLATE_ID", id);
|
||||
state.defaults.env_vars.insert("WRENN_TEMPLATE_ID".into(), id.clone());
|
||||
}
|
||||
|
||||
trigger_restore_and_respond(&state).await
|
||||
}
|
||||
|
||||
async fn trigger_restore_and_respond(state: &AppState) -> axum::response::Response {
|
||||
// Safety net: if health check's postRestoreRecovery hasn't run yet
|
||||
if state
|
||||
.needs_restore
|
||||
.compare_exchange(true, false, Ordering::AcqRel, Ordering::Relaxed)
|
||||
.is_ok()
|
||||
{
|
||||
post_restore_recovery(state);
|
||||
}
|
||||
|
||||
state.conn_tracker.restore_after_snapshot();
|
||||
if let Some(ref ps) = state.port_subsystem {
|
||||
ps.restart();
|
||||
}
|
||||
state.try_restore_recovery();
|
||||
|
||||
(
|
||||
StatusCode::NO_CONTENT,
|
||||
@ -149,46 +143,13 @@ async fn trigger_restore_and_respond(state: &AppState) -> axum::response::Respon
|
||||
.into_response()
|
||||
}
|
||||
|
||||
fn post_restore_recovery(state: &AppState) {
|
||||
tracing::info!("restore: post-restore recovery (no GC needed in Rust)");
|
||||
|
||||
state.snapshot_in_progress.store(false, std::sync::atomic::Ordering::Release);
|
||||
|
||||
state.conn_tracker.restore_after_snapshot();
|
||||
|
||||
if let Some(ref ps) = state.port_subsystem {
|
||||
ps.restart();
|
||||
tracing::info!("restore: port subsystem restarted");
|
||||
}
|
||||
}
|
||||
|
||||
async fn validate_init_access_token(state: &AppState, request_token: &str) -> Result<(), String> {
|
||||
// Fast path: matches existing token
|
||||
if state.access_token.is_set() && !request_token.is_empty() && state.access_token.equals(request_token) {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
// Check MMDS hash
|
||||
if state.is_fc {
|
||||
if let Ok(mmds_hash) = mmds::get_access_token_hash().await {
|
||||
if !mmds_hash.is_empty() {
|
||||
if request_token.is_empty() {
|
||||
let empty_hash = crypto::sha512::hash_access_token("");
|
||||
if mmds_hash == empty_hash {
|
||||
return Ok(());
|
||||
}
|
||||
} else {
|
||||
let token_hash = crypto::sha512::hash_access_token(request_token);
|
||||
if mmds_hash == token_hash {
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
return Err("access token validation failed".into());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// First-time setup: no existing token and no MMDS
|
||||
// First-time setup: no existing token
|
||||
if !state.access_token.is_set() {
|
||||
return Ok(());
|
||||
}
|
||||
@ -268,14 +229,21 @@ async fn setup_nfs(nfs_target: &str, path: &str) {
|
||||
}
|
||||
}
|
||||
|
||||
fn write_run_file(name: &str, value: &str) {
|
||||
let dir = std::path::Path::new("/run/wrenn");
|
||||
if let Err(e) = std::fs::create_dir_all(dir) {
|
||||
tracing::warn!(error = %e, "failed to create /run/wrenn");
|
||||
return;
|
||||
}
|
||||
if let Err(e) = std::fs::write(dir.join(name), value) {
|
||||
tracing::warn!(error = %e, name, "failed to write run file");
|
||||
}
|
||||
}
|
||||
|
||||
fn chrono_parse_to_nanos(ts: &str) -> Result<i64, ()> {
|
||||
// Parse RFC3339 timestamp to nanoseconds since epoch
|
||||
// Simple approach: parse as seconds + fractional
|
||||
let secs = ts.parse::<f64>().ok();
|
||||
if let Some(s) = secs {
|
||||
return Ok((s * 1_000_000_000.0) as i64);
|
||||
}
|
||||
// Try RFC3339 format
|
||||
// For now, fall back to allowing the update
|
||||
Err(())
|
||||
}
|
||||
|
||||
@ -7,7 +7,7 @@ use axum::response::IntoResponse;
|
||||
|
||||
use crate::state::AppState;
|
||||
|
||||
/// POST /snapshot/prepare — quiesce subsystems before Firecracker snapshot.
|
||||
/// POST /snapshot/prepare — quiesce subsystems before VM snapshot.
|
||||
///
|
||||
/// In Rust there is no GC dance. We just:
|
||||
/// 1. Drop page cache to shrink snapshot size
|
||||
|
||||
@ -6,7 +6,6 @@ mod config;
|
||||
mod conntracker;
|
||||
mod crypto;
|
||||
mod execcontext;
|
||||
mod host;
|
||||
mod http;
|
||||
mod logging;
|
||||
mod permissions;
|
||||
@ -22,7 +21,6 @@ use std::sync::Arc;
|
||||
|
||||
use clap::Parser;
|
||||
use tokio::net::TcpListener;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
|
||||
use config::{DEFAULT_PORT, DEFAULT_USER, WRENN_RUN_DIR};
|
||||
use execcontext::Defaults;
|
||||
@ -44,9 +42,6 @@ struct Cli {
|
||||
#[arg(long, default_value_t = DEFAULT_PORT)]
|
||||
port: u16,
|
||||
|
||||
#[arg(long = "isnotfc", default_value_t = false)]
|
||||
is_not_fc: bool,
|
||||
|
||||
#[arg(long)]
|
||||
version: bool,
|
||||
|
||||
@ -73,35 +68,22 @@ async fn main() {
|
||||
return;
|
||||
}
|
||||
|
||||
let use_json = !cli.is_not_fc;
|
||||
logging::init(use_json);
|
||||
logging::init(true);
|
||||
|
||||
if let Err(e) = fs::create_dir_all(WRENN_RUN_DIR) {
|
||||
tracing::error!(error = %e, "failed to create wrenn run directory");
|
||||
}
|
||||
|
||||
let defaults = Defaults::new(DEFAULT_USER);
|
||||
let is_fc_str = if cli.is_not_fc { "false" } else { "true" };
|
||||
defaults
|
||||
.env_vars
|
||||
.insert("WRENN_SANDBOX".into(), is_fc_str.into());
|
||||
.insert("WRENN_SANDBOX".into(), "true".into());
|
||||
|
||||
let wrenn_sandbox_path = Path::new(WRENN_RUN_DIR).join(".WRENN_SANDBOX");
|
||||
if let Err(e) = fs::write(&wrenn_sandbox_path, is_fc_str.as_bytes()) {
|
||||
if let Err(e) = fs::write(&wrenn_sandbox_path, b"true") {
|
||||
tracing::error!(error = %e, "failed to write sandbox file");
|
||||
}
|
||||
|
||||
let cancel = CancellationToken::new();
|
||||
|
||||
// MMDS polling (only in FC mode)
|
||||
if !cli.is_not_fc {
|
||||
let env_vars = Arc::clone(&defaults.env_vars);
|
||||
let cancel_clone = cancel.clone();
|
||||
tokio::spawn(async move {
|
||||
host::mmds::poll_for_opts(env_vars, cancel_clone).await;
|
||||
});
|
||||
}
|
||||
|
||||
// Cgroup manager
|
||||
let cgroup_manager: Arc<dyn cgroups::CgroupManager> =
|
||||
match cgroups::Cgroup2Manager::new(
|
||||
@ -143,14 +125,13 @@ async fn main() {
|
||||
defaults,
|
||||
VERSION.to_string(),
|
||||
COMMIT.to_string(),
|
||||
!cli.is_not_fc,
|
||||
Some(Arc::clone(&port_subsystem)),
|
||||
);
|
||||
|
||||
// Memory reclaimer — drop page cache when available memory is low.
|
||||
// Firecracker balloon device can only reclaim pages the guest kernel freed.
|
||||
// The balloon device can only reclaim pages the guest kernel freed.
|
||||
// Pauses during snapshot/prepare to avoid corrupting kernel page table state.
|
||||
if !cli.is_not_fc {
|
||||
{
|
||||
let state_for_reclaimer = Arc::clone(&state);
|
||||
std::thread::spawn(move || memory_reclaimer(state_for_reclaimer));
|
||||
}
|
||||
@ -188,7 +169,6 @@ async fn main() {
|
||||
}
|
||||
|
||||
port_subsystem.stop();
|
||||
cancel.cancel();
|
||||
}
|
||||
|
||||
fn spawn_initial_command(cmd: &str, state: &AppState) {
|
||||
@ -233,9 +213,11 @@ fn spawn_initial_command(cmd: &str, state: &AppState) {
|
||||
|
||||
fn memory_reclaimer(state: Arc<AppState>) {
|
||||
use std::sync::atomic::Ordering;
|
||||
use std::time::{Duration, SystemTime, UNIX_EPOCH};
|
||||
|
||||
const CHECK_INTERVAL: std::time::Duration = std::time::Duration::from_secs(10);
|
||||
const CHECK_INTERVAL: Duration = Duration::from_secs(10);
|
||||
const DROP_THRESHOLD_PCT: u64 = 80;
|
||||
const RESTORE_GRACE_SECS: u64 = 30;
|
||||
|
||||
loop {
|
||||
std::thread::sleep(CHECK_INTERVAL);
|
||||
@ -244,6 +226,20 @@ fn memory_reclaimer(state: Arc<AppState>) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Skip during post-restore grace period. Balloon deflation causes
|
||||
// transient high memory that resolves on its own — triggering
|
||||
// drop_caches during UFFD page fault storms makes the guest unresponsive.
|
||||
let restore_epoch = state.restore_epoch.load(Ordering::Acquire);
|
||||
if restore_epoch > 0 {
|
||||
let now = SystemTime::now()
|
||||
.duration_since(UNIX_EPOCH)
|
||||
.unwrap_or_default()
|
||||
.as_secs();
|
||||
if now.saturating_sub(restore_epoch) < RESTORE_GRACE_SECS {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
let mut sys = sysinfo::System::new();
|
||||
sys.refresh_memory();
|
||||
let total = sys.total_memory();
|
||||
|
||||
@ -57,7 +57,9 @@ impl Scanner {
|
||||
|
||||
pub async fn scan_and_broadcast(&self, cancel: CancellationToken) {
|
||||
loop {
|
||||
let conns = read_tcp_connections();
|
||||
let conns = tokio::task::spawn_blocking(read_tcp_connections)
|
||||
.await
|
||||
.unwrap_or_default();
|
||||
|
||||
{
|
||||
let subs = self.subs.read().unwrap();
|
||||
|
||||
@ -1,5 +1,6 @@
|
||||
use std::sync::atomic::{AtomicBool, AtomicU32, Ordering};
|
||||
use std::sync::atomic::{AtomicBool, AtomicU32, AtomicU64, Ordering};
|
||||
use std::sync::Arc;
|
||||
use std::time::{SystemTime, UNIX_EPOCH};
|
||||
|
||||
use crate::auth::token::SecureToken;
|
||||
use crate::conntracker::ConnTracker;
|
||||
@ -11,7 +12,6 @@ pub struct AppState {
|
||||
pub defaults: Defaults,
|
||||
pub version: String,
|
||||
pub commit: String,
|
||||
pub is_fc: bool,
|
||||
pub needs_restore: AtomicBool,
|
||||
pub last_set_time: AtomicMax,
|
||||
pub access_token: SecureToken,
|
||||
@ -20,6 +20,8 @@ pub struct AppState {
|
||||
pub cpu_used_pct: AtomicU32,
|
||||
pub cpu_count: AtomicU32,
|
||||
pub snapshot_in_progress: AtomicBool,
|
||||
pub last_health_epoch: AtomicU64,
|
||||
pub restore_epoch: AtomicU64,
|
||||
}
|
||||
|
||||
impl AppState {
|
||||
@ -27,14 +29,12 @@ impl AppState {
|
||||
defaults: Defaults,
|
||||
version: String,
|
||||
commit: String,
|
||||
is_fc: bool,
|
||||
port_subsystem: Option<Arc<PortSubsystem>>,
|
||||
) -> Arc<Self> {
|
||||
let state = Arc::new(Self {
|
||||
defaults,
|
||||
version,
|
||||
commit,
|
||||
is_fc,
|
||||
needs_restore: AtomicBool::new(false),
|
||||
last_set_time: AtomicMax::new(),
|
||||
access_token: SecureToken::new(),
|
||||
@ -43,6 +43,8 @@ impl AppState {
|
||||
cpu_used_pct: AtomicU32::new(0),
|
||||
cpu_count: AtomicU32::new(0),
|
||||
snapshot_in_progress: AtomicBool::new(false),
|
||||
last_health_epoch: AtomicU64::new(0),
|
||||
restore_epoch: AtomicU64::new(0),
|
||||
});
|
||||
|
||||
let state_clone = Arc::clone(&state);
|
||||
@ -60,6 +62,47 @@ impl AppState {
|
||||
pub fn cpu_count(&self) -> u32 {
|
||||
self.cpu_count.load(Ordering::Relaxed)
|
||||
}
|
||||
|
||||
/// Runs post-restore recovery if `needs_restore` is set OR a wall-clock
|
||||
/// gap is detected (catches restores where snapshot/prepare never ran).
|
||||
pub fn try_restore_recovery(&self) {
|
||||
let now_epoch = SystemTime::now()
|
||||
.duration_since(UNIX_EPOCH)
|
||||
.unwrap_or_default()
|
||||
.as_secs();
|
||||
let prev_epoch = self.last_health_epoch.swap(now_epoch, Ordering::AcqRel);
|
||||
|
||||
// Detect restore via wall-clock gap: if >3s passed since last health
|
||||
// check, the VM was frozen and restored. Catches the case where
|
||||
// snapshot/prepare timed out and needs_restore was never set.
|
||||
let gap_detected = prev_epoch > 0 && now_epoch.saturating_sub(prev_epoch) > 3;
|
||||
|
||||
let flag_set = self
|
||||
.needs_restore
|
||||
.compare_exchange(true, false, Ordering::AcqRel, Ordering::Relaxed)
|
||||
.is_ok();
|
||||
|
||||
if !flag_set && !gap_detected {
|
||||
return;
|
||||
}
|
||||
|
||||
if gap_detected && !flag_set {
|
||||
tracing::info!(
|
||||
gap_secs = now_epoch.saturating_sub(prev_epoch),
|
||||
"restore: detected via wall-clock gap (needs_restore was not set)"
|
||||
);
|
||||
}
|
||||
|
||||
tracing::info!("restore: post-restore recovery");
|
||||
self.snapshot_in_progress.store(false, Ordering::Release);
|
||||
self.restore_epoch.store(now_epoch, Ordering::Release);
|
||||
self.conn_tracker.restore_after_snapshot();
|
||||
|
||||
if let Some(ref ps) = self.port_subsystem {
|
||||
ps.restart();
|
||||
tracing::info!("restore: port subsystem restarted");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn cpu_sampler(state: Arc<AppState>) {
|
||||
@ -70,6 +113,15 @@ fn cpu_sampler(state: Arc<AppState>) {
|
||||
|
||||
loop {
|
||||
std::thread::sleep(std::time::Duration::from_secs(1));
|
||||
|
||||
if state.needs_restore.load(Ordering::Acquire) {
|
||||
// After snapshot restore, sysinfo's internal CPU counters are stale.
|
||||
// Reinitialize to get a fresh baseline.
|
||||
sys = System::new();
|
||||
sys.refresh_cpu_all();
|
||||
continue;
|
||||
}
|
||||
|
||||
sys.refresh_cpu_all();
|
||||
|
||||
let pct = sys.global_cpu_usage();
|
||||
|
||||
Reference in New Issue
Block a user