huggingface · michaelfeil · Dec 17, 2025 · Dec 17, 2025 · Dec 17, 2025 · Dec 17, 2025
diff --git a/backends/candle/src/lib.rs b/backends/candle/src/lib.rs
@@ -581,6 +581,29 @@ impl Backend for CandleBackend {
     }
 
     fn health(&self) -> Result<(), BackendError> {
+        // Simple healthcheck by performing a trivial operation
+        // backend is almost unfailable, but e.g. Cuda OOM or cuda "device fallen off the bus"
+        // can be detected this way
+        use candle::Tensor;
+
+        // 1) enqueue a trivial op on the current device
+        let x = Tensor::new(&[1f32], &self.device).e()?;
+        let z = Tensor::new(&[2f32], &self.device).e()?;
+        let y = (&x * &z).e()?;
+
+        // 2) move storage to CPU to surface async CUDA errors by reading back
+        let v = y.to_vec1::<f32>().e()?;
+        if v.len() != 1 || (v[0] - 2.0).abs() > 1e-6 {
+            // michaelfeil: ideally, we should sleep here for 1.0s/5.0s to allow k8s to detect healthcheck failure
+            // without queuing further work on a possibly broken device by blocking the backend.
+            // and sending 429s in the meantime.
+            tracing::error!("Device {:?} healthcheck failed: expected [2.0], got {v:?}", self.device);
+            return Err(BackendError::Inference(format!(
+                "device healthcheck failed: expected [2.0], got {v:?}"
+            )));
+        }
+        tracing::debug!("Device {:?} healthcheck passed", self.device);
+
         Ok(())
     }