diff --git a/backends/candle/src/lib.rs b/backends/candle/src/lib.rs index ff824f555..5c3f41faa 100644 --- a/backends/candle/src/lib.rs +++ b/backends/candle/src/lib.rs @@ -581,6 +581,29 @@ impl Backend for CandleBackend { } fn health(&self) -> Result<(), BackendError> { + // Simple healthcheck by performing a trivial operation + // backend is almost unfailable, but e.g. Cuda OOM or cuda "device fallen off the bus" + // can be detected this way + use candle::Tensor; + + // 1) enqueue a trivial op on the current device + let x = Tensor::new(&[1f32], &self.device).e()?; + let z = Tensor::new(&[2f32], &self.device).e()?; + let y = (&x * &z).e()?; + + // 2) move storage to CPU to surface async CUDA errors by reading back + let v = y.to_vec1::().e()?; + if v.len() != 1 || (v[0] - 2.0).abs() > 1e-6 { + // michaelfeil: ideally, we should sleep here for 1.0s/5.0s to allow k8s to detect healthcheck failure + // without queuing further work on a possibly broken device by blocking the backend. + // and sending 429s in the meantime. + tracing::error!("Device {:?} healthcheck failed: expected [2.0], got {v:?}", self.device); + return Err(BackendError::Inference(format!( + "device healthcheck failed: expected [2.0], got {v:?}" + ))); + } + tracing::debug!("Device {:?} healthcheck passed", self.device); + Ok(()) }