@@ -314,6 +314,82 @@ async def test_evaluate_success(
   assert mock_eval_set_results_manager.save_eval_set_result.call_count == 2
 
 
+@pytest.mark.asyncio
+async def test_evaluate_skips_failed_inference_results(
+    eval_service, mock_eval_sets_manager, mock_eval_set_results_manager, mocker
+):
+  invocation = Invocation(
+      user_content=genai_types.Content(
+          parts=[genai_types.Part(text="test user content.")]
+      ),
+      final_response=genai_types.Content(
+          parts=[genai_types.Part(text="test final response.")]
+      ),
+  )
+  inference_results = [
+      InferenceResult(
+          app_name="test_app",
+          eval_set_id="test_eval_set",
+          eval_case_id="case_failure",
+          inferences=None,
+          session_id="session_fail",
+          status=InferenceStatus.FAILURE,
+          error_message="simulated failure",
+      ),
+      InferenceResult(
+          app_name="test_app",
+          eval_set_id="test_eval_set",
+          eval_case_id="case_success",
+          inferences=[invocation.model_copy(deep=True)],
+          session_id="session_success",
+          status=InferenceStatus.SUCCESS,
+      ),
+      InferenceResult(
+          app_name="test_app",
+          eval_set_id="test_eval_set",
+          eval_case_id="case_unknown",
+          inferences=[invocation.model_copy(deep=True)],
+          session_id="session_unknown",
+          status=InferenceStatus.UNKNOWN,
+      ),
+  ]
+  eval_metric = EvalMetric(metric_name="fake_metric", threshold=0.5)
+  evaluate_request = EvaluateRequest(
+      inference_results=inference_results,
+      evaluate_config=EvaluateConfig(eval_metrics=[eval_metric], parallelism=2),
+  )
+
+  mock_eval_case = mocker.MagicMock(spec=EvalCase)
+  mock_eval_case.conversation = [invocation.model_copy(deep=True)]
+  mock_eval_case.conversation_scenario = None
+  mock_eval_case.session_input = None
+  mock_eval_sets_manager.get_eval_case.return_value = mock_eval_case
+
+  results = []
+  async for result in eval_service.evaluate(evaluate_request):
+    results.append(result)
+
+  assert len(results) == 3
+  results_by_case = {result.eval_id: result for result in results}
+
+  failure_result = results_by_case['case_failure']
+  assert failure_result.final_eval_status == EvalStatus.NOT_EVALUATED
+  assert failure_result.overall_eval_metric_results == []
+  assert failure_result.eval_metric_result_per_invocation == []
+
+  for case_id in ['case_success', 'case_unknown']:
+    case_result = results_by_case[case_id]
+    assert case_result.final_eval_status == EvalStatus.PASSED
+    assert len(case_result.overall_eval_metric_results) == 1
+    assert (
+        case_result.overall_eval_metric_results[0].metric_name == 'fake_metric'
+    )
+    assert case_result.overall_eval_metric_results[0].score == 0.9
+
+  assert mock_eval_sets_manager.get_eval_case.call_count == 3
+  assert mock_eval_set_results_manager.save_eval_set_result.call_count == 3
+
+
 @pytest.mark.asyncio
 async def test_evaluate_eval_case_not_found(
     eval_service,
@@ -407,6 +483,93 @@ async def test_evaluate_single_inference_result(
   assert metric_result.eval_status == EvalStatus.PASSED
 
 
+@pytest.mark.asyncio
+async def test_evaluate_single_inference_result_handles_failed_inference(
+    eval_service, mock_eval_sets_manager, mocker
+):
+  invocation = Invocation(
+      user_content=genai_types.Content(
+          parts=[genai_types.Part(text="test user content.")]
+      ),
+      final_response=genai_types.Content(
+          parts=[genai_types.Part(text="test final response.")]
+      ),
+  )
+  inference_result = InferenceResult(
+      app_name="test_app",
+      eval_set_id="test_eval_set",
+      eval_case_id="case1",
+      inferences=None,
+      session_id="session1",
+      status=InferenceStatus.FAILURE,
+      error_message="simulated inference failure",
+  )
+  eval_metric = EvalMetric(metric_name="fake_metric", threshold=0.5)
+  evaluate_config = EvaluateConfig(eval_metrics=[eval_metric], parallelism=1)
+
+  mock_eval_case = mocker.MagicMock(spec=EvalCase)
+  mock_eval_case.conversation = [invocation.model_copy(deep=True)]
+  mock_eval_case.conversation_scenario = None
+  mock_eval_case.session_input = None
+  mock_eval_sets_manager.get_eval_case.return_value = mock_eval_case
+
+  _, result = await eval_service._evaluate_single_inference_result(
+      inference_result=inference_result, evaluate_config=evaluate_config
+  )
+
+  assert isinstance(result, EvalCaseResult)
+  assert result.eval_id == "case1"
+  assert result.final_eval_status == EvalStatus.NOT_EVALUATED
+  assert result.overall_eval_metric_results == []
+  assert result.eval_metric_result_per_invocation == []
+  mock_eval_sets_manager.get_eval_case.assert_called_once_with(
+      app_name="test_app", eval_set_id="test_eval_set", eval_case_id="case1"
+  )
+
+
+@pytest.mark.asyncio
+async def test_evaluate_single_inference_result_handles_missing_inferences(
+    eval_service, mock_eval_sets_manager, mocker
+):
+  invocation = Invocation(
+      user_content=genai_types.Content(
+          parts=[genai_types.Part(text="test user content.")]
+      ),
+      final_response=genai_types.Content(
+          parts=[genai_types.Part(text="test final response.")]
+      ),
+  )
+  inference_result = InferenceResult(
+      app_name="test_app",
+      eval_set_id="test_eval_set",
+      eval_case_id="case1",
+      inferences=None,
+      session_id="session1",
+      status=InferenceStatus.SUCCESS,
+  )
+  eval_metric = EvalMetric(metric_name="fake_metric", threshold=0.5)
+  evaluate_config = EvaluateConfig(eval_metrics=[eval_metric], parallelism=1)
+
+  mock_eval_case = mocker.MagicMock(spec=EvalCase)
+  mock_eval_case.conversation = [invocation.model_copy(deep=True)]
+  mock_eval_case.conversation_scenario = None
+  mock_eval_case.session_input = None
+  mock_eval_sets_manager.get_eval_case.return_value = mock_eval_case
+
+  _, result = await eval_service._evaluate_single_inference_result(
+      inference_result=inference_result, evaluate_config=evaluate_config
+  )
+
+  assert isinstance(result, EvalCaseResult)
+  assert result.eval_id == "case1"
+  assert result.final_eval_status == EvalStatus.NOT_EVALUATED
+  assert result.overall_eval_metric_results == []
+  assert result.eval_metric_result_per_invocation == []
+  mock_eval_sets_manager.get_eval_case.assert_called_once_with(
+      app_name="test_app", eval_set_id="test_eval_set", eval_case_id="case1"
+  )
+
+
 @pytest.mark.asyncio
 async def test_evaluate_single_inference_result_for_conversation_scenario(
     eval_service, mock_eval_sets_manager, mocker
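Taken together, the assertions in these new tests pin down a single skip predicate for the evaluation path: a FAILURE result, or a result carrying no inferences, is reported as NOT_EVALUATED with empty metric lists, while anything else (including UNKNOWN status) is scored normally. A minimal sketch of that predicate, inferred from the assertions above rather than copied from the eval service implementation, and assuming only the InferenceStatus enum this test module already imports:

    def _should_evaluate(inference_result) -> bool:
      # Inferred from the tests: InferenceStatus.FAILURE or inferences=None leads
      # to an EvalCaseResult with final_eval_status == EvalStatus.NOT_EVALUATED
      # and empty overall/per-invocation metric result lists; any other result
      # (including InferenceStatus.UNKNOWN with inferences) is evaluated.
      return (
          inference_result.status != InferenceStatus.FAILURE
          and bool(inference_result.inferences)
      )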