From 98be918abd59b424c750b09d19b053718cbf1186 Mon Sep 17 00:00:00 2001 From: michaelfeil Date: Wed, 17 Dec 2025 19:38:15 +0000 Subject: [PATCH 1/3] candle: health check by queuing on cuda # What does this PR do? Fixes # (issue) ## Before submitting - [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case). - [ ] Did you read the [contributor guideline](https://github.com/huggingface/text-embeddings-inference/blob/main/CONTRIBUTING.md)? - [ ] Was this discussed/approved via a GitHub issue or the [forum](https://discuss.huggingface.co/)? Please add a link to it if that's the case. - [ ] Did you make sure to update the documentation with your changes? Here are the [documentation guidelines](https://github.com/huggingface/transformers/tree/main/docs). - [ ] Did you write any new necessary tests? If applicable, did you include or update the `insta` snapshots? ## Who can review? Anyone in the community is free to review the PR once the tests have passed. Feel free to tag members/contributors who may be interested in your PR. --- backends/candle/src/lib.rs | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/backends/candle/src/lib.rs b/backends/candle/src/lib.rs index ff824f555..739c494f1 100644 --- a/backends/candle/src/lib.rs +++ b/backends/candle/src/lib.rs @@ -581,6 +581,26 @@ impl Backend for CandleBackend { } fn health(&self) -> Result<(), BackendError> { + // Simple healthcheck by performing a trivial operation + // backend is almost unfailable, but e.g. 
Cuda OOM or cuda "device fallen off the bus" + // can be detected this way + use candle_core::Tensor; + + // 1) enqueue a trivial op on the current device + let x = Tensor::new(&[1f32], &self.device).e()?; + let y = (&x * 2f32).e()?; + + // 2) force completion + surface async CUDA errors by reading back + let v = y.to_vec1::<f32>().e()?; + if v.len() != 1 || (v[0] - 2.0).abs() > 1e-6 { + // ideally, we should sleep here for 1.0s to allow k8s to detect healthcheck failure + // without queuing further work on a possibly broken device by blocking the backend. + // and sending 429s in the meantime. + return Err(BackendError::Inference(format!( + "device healthcheck failed: expected [2.0], got {v:?}" + ))); + } + Ok(()) } From 28029afa757add6fc29b1cf88bcd1f40ab6f09c1 Mon Sep 17 00:00:00 2001 From: michaelfeil Date: Wed, 17 Dec 2025 19:45:50 +0000 Subject: [PATCH 2/3] add lib.rs --- backends/candle/src/lib.rs | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/backends/candle/src/lib.rs b/backends/candle/src/lib.rs index 739c494f1..b3836afcd 100644 --- a/backends/candle/src/lib.rs +++ b/backends/candle/src/lib.rs @@ -584,16 +584,17 @@ impl Backend for CandleBackend { // Simple healthcheck by performing a trivial operation // backend is almost unfailable, but e.g. 
Cuda OOM or cuda "device fallen off the bus" // can be detected this way - use candle_core::Tensor; + use candle::Tensor; // 1) enqueue a trivial op on the current device let x = Tensor::new(&[1f32], &self.device).e()?; - let y = (&x * 2f32).e()?; + let z = Tensor::new(&[2f32], &self.device).e()?; + let y = (&x * &z).e()?; - // 2) force completion + surface async CUDA errors by reading back + // 2) move storage to CPU to surface async CUDA errors by reading back let v = y.to_vec1::<f32>().e()?; if v.len() != 1 || (v[0] - 2.0).abs() > 1e-6 { - // ideally, we should sleep here for 1.0s to allow k8s to detect healthcheck failure + // michaelfeil: ideally, we should sleep here for 1.0s/5.0s to allow k8s to detect healthcheck failure // without queuing further work on a possibly broken device by blocking the backend. // and sending 429s in the meantime. return Err(BackendError::Inference(format!( From 22a0c871c603fb3b2e90528cd5cd4be4aec76050 Mon Sep 17 00:00:00 2001 From: michaelfeil Date: Wed, 17 Dec 2025 19:53:04 +0000 Subject: [PATCH 3/3] add logging --- backends/candle/src/lib.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/backends/candle/src/lib.rs b/backends/candle/src/lib.rs index b3836afcd..5c3f41faa 100644 --- a/backends/candle/src/lib.rs +++ b/backends/candle/src/lib.rs @@ -597,10 +597,12 @@ impl Backend for CandleBackend { // michaelfeil: ideally, we should sleep here for 1.0s/5.0s to allow k8s to detect healthcheck failure // without queuing further work on a possibly broken device by blocking the backend. // and sending 429s in the meantime. + tracing::error!("Device {:?} healthcheck failed: expected [2.0], got {v:?}", self.device); return Err(BackendError::Inference(format!( "device healthcheck failed: expected [2.0], got {v:?}" ))); } + tracing::debug!("Device {:?} healthcheck passed", self.device); Ok(()) }