src/google/adk/evaluation/local_eval_service.py (39 additions, 0 deletions)
@@ -226,6 +226,20 @@ async def _evaluate_single_inference_result(
else 'test_user_id'
)

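# Inference failed or produced no inferences: skip metric evaluation and
# record the case as NOT_EVALUATED.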
if (
    inference_result.status == InferenceStatus.FAILURE
    or inference_result.inferences is None
):
logger.error(
'Evaluation attempted on failed inference for eval case `%s`.'
' Error: %s',
inference_result.eval_case_id,
inference_result.error_message
)
eval_case_result = await self._build_not_evaluated_eval_case_result(
inference_result=inference_result,
user_id=user_id,
)
return (inference_result, eval_case_result)

if eval_case.conversation_scenario is None and len(
inference_result.inferences
) != len(eval_case.conversation):
@@ -389,6 +403,31 @@ def _generate_final_eval_status(

return final_eval_status

async def _build_not_evaluated_eval_case_result(
self,
*,
inference_result: InferenceResult,
user_id: str,
) -> EvalCaseResult:
"""Constructs an EvalCaseResult for cases that could not be evaluated."""
session_details = await self._session_service.get_session(
app_name=inference_result.app_name,
user_id=user_id,
session_id=inference_result.session_id,
)

return EvalCaseResult(
eval_set_file=inference_result.eval_set_id,
eval_set_id=inference_result.eval_set_id,
eval_id=inference_result.eval_case_id,
final_eval_status=EvalStatus.NOT_EVALUATED,
overall_eval_metric_results=[],
eval_metric_result_per_invocation=[],
session_id=inference_result.session_id,
session_details=session_details,
user_id=user_id,
)

async def _perform_inference_single_eval_item(
self,
app_name: str,
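With this change, a failed or empty inference no longer aborts evaluation; it surfaces to callers of evaluate() as an EvalCaseResult whose final_eval_status is EvalStatus.NOT_EVALUATED. Below is a minimal sketch of how a consumer might separate such cases from evaluated ones, assuming the async-generator evaluate() API exercised in the tests that follow; the import paths are assumptions, not part of this PR.

# Sketch only; import paths are assumed and may differ in your ADK version.
from google.adk.evaluation.eval_result import EvalCaseResult  # assumed path
from google.adk.evaluation.evaluator import EvalStatus  # assumed path


async def split_results(eval_service, evaluate_request):
  """Separates evaluated results from cases skipped due to failed inference."""
  evaluated: list[EvalCaseResult] = []
  not_evaluated: list[EvalCaseResult] = []
  async for result in eval_service.evaluate(evaluate_request):
    if result.final_eval_status == EvalStatus.NOT_EVALUATED:
      not_evaluated.append(result)
    else:
      evaluated.append(result)
  return evaluated, not_evaluated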
tests/unittests/evaluation/test_local_eval_service.py (163 additions, 0 deletions)
@@ -314,6 +314,82 @@ async def test_evaluate_success(
assert mock_eval_set_results_manager.save_eval_set_result.call_count == 2


@pytest.mark.asyncio
async def test_evaluate_skips_failed_inference_results(
eval_service, mock_eval_sets_manager, mock_eval_set_results_manager, mocker
):
invocation = Invocation(
user_content=genai_types.Content(
parts=[genai_types.Part(text="test user content.")]
),
final_response=genai_types.Content(
parts=[genai_types.Part(text="test final response.")]
),
)
inference_results = [
InferenceResult(
app_name="test_app",
eval_set_id="test_eval_set",
eval_case_id="case_failure",
inferences=None,
session_id="session_fail",
status=InferenceStatus.FAILURE,
error_message="simulated failure",
),
InferenceResult(
app_name="test_app",
eval_set_id="test_eval_set",
eval_case_id="case_success",
inferences=[invocation.model_copy(deep=True)],
session_id="session_success",
status=InferenceStatus.SUCCESS,
),
InferenceResult(
app_name="test_app",
eval_set_id="test_eval_set",
eval_case_id="case_unknown",
inferences=[invocation.model_copy(deep=True)],
session_id="session_unknown",
status=InferenceStatus.UNKNOWN,
),
]
eval_metric = EvalMetric(metric_name="fake_metric", threshold=0.5)
evaluate_request = EvaluateRequest(
inference_results=inference_results,
evaluate_config=EvaluateConfig(eval_metrics=[eval_metric], parallelism=2),
)

mock_eval_case = mocker.MagicMock(spec=EvalCase)
mock_eval_case.conversation = [invocation.model_copy(deep=True)]
mock_eval_case.conversation_scenario = None
mock_eval_case.session_input = None
mock_eval_sets_manager.get_eval_case.return_value = mock_eval_case

results = []
async for result in eval_service.evaluate(evaluate_request):
results.append(result)

assert len(results) == 3
results_by_case = {result.eval_id: result for result in results}

failure_result = results_by_case['case_failure']
assert failure_result.final_eval_status == EvalStatus.NOT_EVALUATED
assert failure_result.overall_eval_metric_results == []
assert failure_result.eval_metric_result_per_invocation == []

for case_id in ['case_success', 'case_unknown']:
case_result = results_by_case[case_id]
assert case_result.final_eval_status == EvalStatus.PASSED
assert len(case_result.overall_eval_metric_results) == 1
assert (
case_result.overall_eval_metric_results[0].metric_name == 'fake_metric'
)
assert case_result.overall_eval_metric_results[0].score == 0.9

assert mock_eval_sets_manager.get_eval_case.call_count == 3
assert mock_eval_set_results_manager.save_eval_set_result.call_count == 3


@pytest.mark.asyncio
async def test_evaluate_eval_case_not_found(
eval_service,
@@ -407,6 +483,93 @@ async def test_evaluate_single_inference_result(
assert metric_result.eval_status == EvalStatus.PASSED


@pytest.mark.asyncio
async def test_evaluate_single_inference_result_handles_failed_inference(
eval_service, mock_eval_sets_manager, mocker
):
invocation = Invocation(
user_content=genai_types.Content(
parts=[genai_types.Part(text="test user content.")]
),
final_response=genai_types.Content(
parts=[genai_types.Part(text="test final response.")]
),
)
inference_result = InferenceResult(
app_name="test_app",
eval_set_id="test_eval_set",
eval_case_id="case1",
inferences=None,
session_id="session1",
status=InferenceStatus.FAILURE,
error_message="simulated inference failure",
)
eval_metric = EvalMetric(metric_name="fake_metric", threshold=0.5)
evaluate_config = EvaluateConfig(eval_metrics=[eval_metric], parallelism=1)

mock_eval_case = mocker.MagicMock(spec=EvalCase)
mock_eval_case.conversation = [invocation.model_copy(deep=True)]
mock_eval_case.conversation_scenario = None
mock_eval_case.session_input = None
mock_eval_sets_manager.get_eval_case.return_value = mock_eval_case

_, result = await eval_service._evaluate_single_inference_result(
inference_result=inference_result, evaluate_config=evaluate_config
)

assert isinstance(result, EvalCaseResult)
assert result.eval_id == "case1"
assert result.final_eval_status == EvalStatus.NOT_EVALUATED
assert result.overall_eval_metric_results == []
assert result.eval_metric_result_per_invocation == []
mock_eval_sets_manager.get_eval_case.assert_called_once_with(
app_name="test_app", eval_set_id="test_eval_set", eval_case_id="case1"
)
Reviewer comment (Contributor, severity: medium) on lines +487 to +527:

This is a great test for handling failed inferences! To improve coverage of the new logic in _evaluate_single_inference_result, could you please add another test case? Specifically, one that covers the scenario where inference_result.status is not FAILURE (e.g., SUCCESS or UNKNOWN) but inference_result.inferences is None. That would ensure the second condition of the new if check is also exercised.

Author reply:

Added another test, test_evaluate_single_inference_result_handles_missing_inferences, to cover this case (see below).

@pytest.mark.asyncio
async def test_evaluate_single_inference_result_handles_missing_inferences(
eval_service, mock_eval_sets_manager, mocker
):
invocation = Invocation(
user_content=genai_types.Content(
parts=[genai_types.Part(text="test user content.")]
),
final_response=genai_types.Content(
parts=[genai_types.Part(text="test final response.")]
),
)
inference_result = InferenceResult(
app_name="test_app",
eval_set_id="test_eval_set",
eval_case_id="case1",
inferences=None,
session_id="session1",
status=InferenceStatus.SUCCESS,
)
eval_metric = EvalMetric(metric_name="fake_metric", threshold=0.5)
evaluate_config = EvaluateConfig(eval_metrics=[eval_metric], parallelism=1)

mock_eval_case = mocker.MagicMock(spec=EvalCase)
mock_eval_case.conversation = [invocation.model_copy(deep=True)]
mock_eval_case.conversation_scenario = None
mock_eval_case.session_input = None
mock_eval_sets_manager.get_eval_case.return_value = mock_eval_case

_, result = await eval_service._evaluate_single_inference_result(
inference_result=inference_result, evaluate_config=evaluate_config
)

assert isinstance(result, EvalCaseResult)
assert result.eval_id == "case1"
assert result.final_eval_status == EvalStatus.NOT_EVALUATED
assert result.overall_eval_metric_results == []
assert result.eval_metric_result_per_invocation == []
mock_eval_sets_manager.get_eval_case.assert_called_once_with(
app_name="test_app", eval_set_id="test_eval_set", eval_case_id="case1"
)


@pytest.mark.asyncio
async def test_evaluate_single_inference_result_for_conversation_scenario(
eval_service, mock_eval_sets_manager, mocker