
Commit a3529a8

fixed CR comments
1 parent: f273517

File tree

2 files changed: +202, -0 lines changed


src/google/adk/evaluation/local_eval_service.py

Lines changed: 39 additions & 0 deletions
@@ -226,6 +226,20 @@ async def _evaluate_single_inference_result(
         else 'test_user_id'
     )

+    if inference_result.status == InferenceStatus.FAILURE or \
+        inference_result.inferences is None:
+      logger.error(
+          'Evaluation attempted on failed inference for eval case `%s`.'
+          ' Error: %s',
+          inference_result.eval_case_id,
+          inference_result.error_message
+      )
+      eval_case_result = await self._build_not_evaluated_eval_case_result(
+          inference_result=inference_result,
+          user_id=user_id,
+      )
+      return (inference_result, eval_case_result)
+
     if eval_case.conversation_scenario is None and len(
         inference_result.inferences
     ) != len(eval_case.conversation):
@@ -389,6 +403,31 @@ def _generate_final_eval_status(

     return final_eval_status

+  async def _build_not_evaluated_eval_case_result(
+      self,
+      *,
+      inference_result: InferenceResult,
+      user_id: str,
+  ) -> EvalCaseResult:
+    """Constructs an EvalCaseResult for cases that could not be evaluated."""
+    session_details = await self._session_service.get_session(
+        app_name=inference_result.app_name,
+        user_id=user_id,
+        session_id=inference_result.session_id,
+    )
+
+    return EvalCaseResult(
+        eval_set_file=inference_result.eval_set_id,
+        eval_set_id=inference_result.eval_set_id,
+        eval_id=inference_result.eval_case_id,
+        final_eval_status=EvalStatus.NOT_EVALUATED,
+        overall_eval_metric_results=[],
+        eval_metric_result_per_invocation=[],
+        session_id=inference_result.session_id,
+        session_details=session_details,
+        user_id=user_id,
+    )
+
   async def _perform_inference_single_eval_item(
       self,
       app_name: str,
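
Note on the behavior this adds (not part of the diff itself): a failed or empty inference no longer reaches the metric evaluators; callers of `evaluate()` instead receive an `EvalCaseResult` whose `final_eval_status` is `EvalStatus.NOT_EVALUATED` and whose metric lists are empty. A minimal consumer-side sketch, assuming `eval_service` and `evaluate_request` are constructed as in the tests below and that `EvalStatus` is already imported from the evaluation package:

async def split_skipped_cases(eval_service, evaluate_request):
  # Separate eval case ids that were skipped (inference failed or produced
  # no inferences) from cases that were actually evaluated.
  skipped, evaluated = [], []
  async for case_result in eval_service.evaluate(evaluate_request):
    if case_result.final_eval_status == EvalStatus.NOT_EVALUATED:
      skipped.append(case_result.eval_id)
    else:
      evaluated.append(case_result.eval_id)
  return skipped, evaluated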

tests/unittests/evaluation/test_local_eval_service.py

Lines changed: 163 additions & 0 deletions
@@ -314,6 +314,82 @@ async def test_evaluate_success(
   assert mock_eval_set_results_manager.save_eval_set_result.call_count == 2


+@pytest.mark.asyncio
+async def test_evaluate_skips_failed_inference_results(
+    eval_service, mock_eval_sets_manager, mock_eval_set_results_manager, mocker
+):
+  invocation = Invocation(
+      user_content=genai_types.Content(
+          parts=[genai_types.Part(text="test user content.")]
+      ),
+      final_response=genai_types.Content(
+          parts=[genai_types.Part(text="test final response.")]
+      ),
+  )
+  inference_results = [
+      InferenceResult(
+          app_name="test_app",
+          eval_set_id="test_eval_set",
+          eval_case_id="case_failure",
+          inferences=None,
+          session_id="session_fail",
+          status=InferenceStatus.FAILURE,
+          error_message="simulated failure",
+      ),
+      InferenceResult(
+          app_name="test_app",
+          eval_set_id="test_eval_set",
+          eval_case_id="case_success",
+          inferences=[invocation.model_copy(deep=True)],
+          session_id="session_success",
+          status=InferenceStatus.SUCCESS,
+      ),
+      InferenceResult(
+          app_name="test_app",
+          eval_set_id="test_eval_set",
+          eval_case_id="case_unknown",
+          inferences=[invocation.model_copy(deep=True)],
+          session_id="session_unknown",
+          status=InferenceStatus.UNKNOWN,
+      ),
+  ]
+  eval_metric = EvalMetric(metric_name="fake_metric", threshold=0.5)
+  evaluate_request = EvaluateRequest(
+      inference_results=inference_results,
+      evaluate_config=EvaluateConfig(eval_metrics=[eval_metric], parallelism=2),
+  )
+
+  mock_eval_case = mocker.MagicMock(spec=EvalCase)
+  mock_eval_case.conversation = [invocation.model_copy(deep=True)]
+  mock_eval_case.conversation_scenario = None
+  mock_eval_case.session_input = None
+  mock_eval_sets_manager.get_eval_case.return_value = mock_eval_case
+
+  results = []
+  async for result in eval_service.evaluate(evaluate_request):
+    results.append(result)
+
+  assert len(results) == 3
+  results_by_case = {result.eval_id: result for result in results}
+
+  failure_result = results_by_case['case_failure']
+  assert failure_result.final_eval_status == EvalStatus.NOT_EVALUATED
+  assert failure_result.overall_eval_metric_results == []
+  assert failure_result.eval_metric_result_per_invocation == []
+
+  for case_id in ['case_success', 'case_unknown']:
+    case_result = results_by_case[case_id]
+    assert case_result.final_eval_status == EvalStatus.PASSED
+    assert len(case_result.overall_eval_metric_results) == 1
+    assert (
+        case_result.overall_eval_metric_results[0].metric_name == 'fake_metric'
+    )
+    assert case_result.overall_eval_metric_results[0].score == 0.9
+
+  assert mock_eval_sets_manager.get_eval_case.call_count == 3
+  assert mock_eval_set_results_manager.save_eval_set_result.call_count == 3
+
+
 @pytest.mark.asyncio
 async def test_evaluate_eval_case_not_found(
     eval_service,
@@ -407,6 +483,93 @@ async def test_evaluate_single_inference_result(
   assert metric_result.eval_status == EvalStatus.PASSED


+@pytest.mark.asyncio
+async def test_evaluate_single_inference_result_handles_failed_inference(
+    eval_service, mock_eval_sets_manager, mocker
+):
+  invocation = Invocation(
+      user_content=genai_types.Content(
+          parts=[genai_types.Part(text="test user content.")]
+      ),
+      final_response=genai_types.Content(
+          parts=[genai_types.Part(text="test final response.")]
+      ),
+  )
+  inference_result = InferenceResult(
+      app_name="test_app",
+      eval_set_id="test_eval_set",
+      eval_case_id="case1",
+      inferences=None,
+      session_id="session1",
+      status=InferenceStatus.FAILURE,
+      error_message="simulated inference failure",
+  )
+  eval_metric = EvalMetric(metric_name="fake_metric", threshold=0.5)
+  evaluate_config = EvaluateConfig(eval_metrics=[eval_metric], parallelism=1)
+
+  mock_eval_case = mocker.MagicMock(spec=EvalCase)
+  mock_eval_case.conversation = [invocation.model_copy(deep=True)]
+  mock_eval_case.conversation_scenario = None
+  mock_eval_case.session_input = None
+  mock_eval_sets_manager.get_eval_case.return_value = mock_eval_case
+
+  _, result = await eval_service._evaluate_single_inference_result(
+      inference_result=inference_result, evaluate_config=evaluate_config
+  )
+
+  assert isinstance(result, EvalCaseResult)
+  assert result.eval_id == "case1"
+  assert result.final_eval_status == EvalStatus.NOT_EVALUATED
+  assert result.overall_eval_metric_results == []
+  assert result.eval_metric_result_per_invocation == []
+  mock_eval_sets_manager.get_eval_case.assert_called_once_with(
+      app_name="test_app", eval_set_id="test_eval_set", eval_case_id="case1"
+  )
+
+
+@pytest.mark.asyncio
+async def test_evaluate_single_inference_result_handles_missing_inferences(
+    eval_service, mock_eval_sets_manager, mocker
+):
+  invocation = Invocation(
+      user_content=genai_types.Content(
+          parts=[genai_types.Part(text="test user content.")]
+      ),
+      final_response=genai_types.Content(
+          parts=[genai_types.Part(text="test final response.")]
+      ),
+  )
+  inference_result = InferenceResult(
+      app_name="test_app",
+      eval_set_id="test_eval_set",
+      eval_case_id="case1",
+      inferences=None,
+      session_id="session1",
+      status=InferenceStatus.SUCCESS,
+  )
+  eval_metric = EvalMetric(metric_name="fake_metric", threshold=0.5)
+  evaluate_config = EvaluateConfig(eval_metrics=[eval_metric], parallelism=1)
+
+  mock_eval_case = mocker.MagicMock(spec=EvalCase)
+  mock_eval_case.conversation = [invocation.model_copy(deep=True)]
+  mock_eval_case.conversation_scenario = None
+  mock_eval_case.session_input = None
+  mock_eval_sets_manager.get_eval_case.return_value = mock_eval_case
+
+  _, result = await eval_service._evaluate_single_inference_result(
+      inference_result=inference_result, evaluate_config=evaluate_config
+  )
+
+  assert isinstance(result, EvalCaseResult)
+  assert result.eval_id == "case1"
+  assert result.final_eval_status == EvalStatus.NOT_EVALUATED
+  assert result.overall_eval_metric_results == []
+  assert result.eval_metric_result_per_invocation == []
+  mock_eval_sets_manager.get_eval_case.assert_called_once_with(
+      app_name="test_app", eval_set_id="test_eval_set", eval_case_id="case1"
+  )
+
+
 @pytest.mark.asyncio
 async def test_evaluate_single_inference_result_for_conversation_scenario(
     eval_service, mock_eval_sets_manager, mocker
