From 3ffdf0cf6ffc94b7c711acceba35b2fd482a1dd8 Mon Sep 17 00:00:00 2001
From: Akshath Mangudi
Date: Fri, 21 Nov 2025 17:53:14 +0530
Subject: [PATCH 1/5] initial commit

---
 src/lighteval/tasks/tasks/multi_challenge/__init__.py | 0
 src/lighteval/tasks/tasks/multi_challenge/main.py     | 5 +++++
 2 files changed, 5 insertions(+)
 create mode 100644 src/lighteval/tasks/tasks/multi_challenge/__init__.py
 create mode 100644 src/lighteval/tasks/tasks/multi_challenge/main.py

diff --git a/src/lighteval/tasks/tasks/multi_challenge/__init__.py b/src/lighteval/tasks/tasks/multi_challenge/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/src/lighteval/tasks/tasks/multi_challenge/main.py b/src/lighteval/tasks/tasks/multi_challenge/main.py
new file mode 100644
index 000000000..40a463016
--- /dev/null
+++ b/src/lighteval/tasks/tasks/multi_challenge/main.py
@@ -0,0 +1,5 @@
+"""
+DOCSTRING TO BE IMPLEMENTED.
+
+Entry point for the Multi-Challenge task.
+"""

From d4cda4470e78b8856a2196f8e66bb094f8be66b3 Mon Sep 17 00:00:00 2001
From: Akshath Mangudi
Date: Sat, 22 Nov 2025 17:20:34 +0530
Subject: [PATCH 2/5] multi challenge impl, ready for review

---
 src/lighteval/tasks/tasks/multi_challenge.py | 166 ++++++++++++++++++
 .../tasks/tasks/multi_challenge/__init__.py  |   0
 .../tasks/tasks/multi_challenge/main.py      |   5 -
 3 files changed, 166 insertions(+), 5 deletions(-)
 create mode 100644 src/lighteval/tasks/tasks/multi_challenge.py
 delete mode 100644 src/lighteval/tasks/tasks/multi_challenge/__init__.py
 delete mode 100644 src/lighteval/tasks/tasks/multi_challenge/main.py

diff --git a/src/lighteval/tasks/tasks/multi_challenge.py b/src/lighteval/tasks/tasks/multi_challenge.py
new file mode 100644
index 000000000..92b661340
--- /dev/null
+++ b/src/lighteval/tasks/tasks/multi_challenge.py
@@ -0,0 +1,166 @@
+"""
+name:
+MultiChallenge
+
+dataset:
+nmayorga7/multichallenge
+
+abstract:
+MultiChallenge is a realistic multi-turn conversation evaluation benchmark challenging
+to frontier LLMs. It identifies four categories of challenges in multi-turn conversations
+that require accurate instruction-following, context allocation, and in-context reasoning
+simultaneously. All frontier models achieve less than 50% accuracy on MultiChallenge.
+
+languages:
+english
+
+tags:
+conversational, generation, multi-turn, instruction-following
+
+paper:
+https://arxiv.org/abs/2501.17399
+"""
+
+import re
+
+from inspect_ai.dataset import Sample
+from inspect_ai.model import get_model
+from inspect_ai.scorer import Score, Target, accuracy, scorer, stderr
+from inspect_ai.solver import TaskState, generate
+
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.requests import Doc
+
+
+JUDGE_PROMPT = """You are tasked with evaluating a model response to see if it meets a specific criteria.
+The criteria will always be YES/NO evaluation.
+
+The model response is as follows:
+
+{}
+
+
+The criteria that the model response must meet is as follows. Be VERY STRICT!:
+
+{}
+
+
+Print your reasoning followed by your verdict, either "YES" or "NO"."""
+
+
+def format_conversation(conversation: list[dict]) -> str:
+    """Format conversation messages into a single string for model input."""
+    formatted_messages = []
+    for msg in conversation:
+        role = msg["role"].upper()
+        content = msg["content"]
+        formatted_messages.append(f"{role}:\n{content}")
+
+    return "\n\n".join(formatted_messages)
+
+
+def multi_challenge_prompt(line, task_name: str = None):
+    """Convert dataset to Doc object"""
+
+    conversation = line["CONVERSATION"]
+    formatted_conv = format_conversation(conversation)
+    return Doc(
+        task_name=task_name,
+        query=formatted_conv,
+        instruction=None,
+        specific={
+            "question_id": line["QUESTION_ID"],
+            "axis": line["AXIS"],
+            "target_question": line["TARGET_QUESTION"],
+            "pass_criteria": line["PASS_CRITERIA"],
+            "conversation": conversation,
+        },
+    )
+
+
+@scorer(metrics=[accuracy(), stderr()])
+def multi_challenge_scorer():
+    async def score(state: TaskState, target: Target):
+        response = state.output.completion
+
+        target_question = target.text
+        pass_criteria = state.metadata.get("pass_criteria", "YES")
+
+        if not target_question:
+            return Score(
+                value="I",
+                answer=response,
+                explanation="Target question not found.",
+            )
+
+        try:
+            judge_model = get_model("openai/gpt-4o-2024-08-06")
+            judge_prompt = JUDGE_PROMPT.format(response, target_question)
+
+            judge_result = await judge_model.generate(judge_prompt)
+            judge_output = judge_result.completion
+
+            verdict_match = re.search(r"\b(YES|NO)\b", judge_output, re.IGNORECASE)
+
+            if not verdict_match:
+                return Score(
+                    value="I",
+                    answer=response,
+                    explanation=f"Could not extract verdict from judge output: {judge_output}.",
+                )
+
+            judge_verdict = verdict_match.group(1).upper()
+            passed = judge_verdict == pass_criteria
+
+            return Score(
+                value="C" if passed else "I",
+                answer=response,
+                explanation=f"Judge verdict: {judge_verdict}, Expected: {pass_criteria}, Response: {response}.",
+            )
+
+        except Exception as e:
+            return Score(
+                value="I",
+                answer=response,
+                explanation=f"Error during judge evaluation: {str(e)}.",
+            )
+
+    return score
+
+
+def record_to_sample(record: dict) -> Sample:
+    """Convert dataset record to inspect-ai Sample object."""
+    conversation = record["CONVERSATION"]
+    formatted_conv = format_conversation(conversation)
+
+    return Sample(
+        input=formatted_conv,
+        target=record["TARGET_QUESTION"],
+        metadata={
+            "question_id": record["QUESTION_ID"],
+            "axis": record["AXIS"],
+            "pass_criteria": record["PASS_CRITERIA"],
+            "conversation": conversation,
+        },
+    )
+
+
+multi_challenge = LightevalTaskConfig(
+    name="multi_challenge",
+    prompt_function=multi_challenge_prompt,
+    hf_repo="nmayorga7/multichallenge",
+    hf_subset="default",
+    hf_avail_splits=["train"],
+    evaluation_splits=["train"],
+    few_shots_split=None,
+    few_shots_select=None,
+    generation_size=2048,
+    stop_sequence=[],
+    version=0,
+    sample_fields=record_to_sample,
+    metrics=[],  # Metrics are defined in the scorer decorator for inspect-ai tasks
+    solver=[generate(cache=True)],
+    scorer=multi_challenge_scorer(),
+)
+
+TASKS_TABLE = [multi_challenge]
diff --git a/src/lighteval/tasks/tasks/multi_challenge/__init__.py b/src/lighteval/tasks/tasks/multi_challenge/__init__.py
deleted file mode 100644
index e69de29bb..000000000
diff --git a/src/lighteval/tasks/tasks/multi_challenge/main.py b/src/lighteval/tasks/tasks/multi_challenge/main.py
deleted file mode 100644
index 40a463016..000000000
--- a/src/lighteval/tasks/tasks/multi_challenge/main.py
+++ /dev/null
@@ -1,5 +0,0 @@
-"""
-DOCSTRING TO BE IMPLEMENTED.
-
-Entry point for the Multi-Challenge task.
-"""
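
Note (not part of the patch): PATCH 2 scores a response by formatting JUDGE_PROMPT with two positional values, the model completion and the target question, then regex-matching the judge's YES/NO verdict and comparing it with PASS_CRITERIA. A minimal sketch of that flow, assuming the patched module is importable; the toy response, criterion, and judge output below are invented for illustration:

import re

# Assumes PATCH 2 is applied so this module exists on the import path.
from lighteval.tasks.tasks.multi_challenge import JUDGE_PROMPT

# Invented stand-ins for state.output.completion and target.text.
toy_response = "Here is the recipe again, still without any nuts, as requested in turn 1."
toy_criterion = "Does the response remember the user's nut allergy? Answer YES or NO."

# The scorer fills the two positional {} slots: model response first, target question second.
judge_prompt = JUDGE_PROMPT.format(toy_response, toy_criterion)

# Invented stand-in for the gpt-4o judge completion; only the verdict word is extracted.
judge_output = "The response keeps the allergy constraint from earlier turns. Verdict: YES"
match = re.search(r"\b(YES|NO)\b", judge_output, re.IGNORECASE)
verdict = match.group(1).upper() if match else None
# verdict == "YES"; the scorer then maps (verdict == pass_criteria) to value "C" or "I".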

From f1d82efe6eed69017a7bc23338a16cb0c29b31d8 Mon Sep 17 00:00:00 2001
From: Akshath Mangudi
Date: Sat, 22 Nov 2025 17:24:30 +0530
Subject: [PATCH 3/5] docstring fixes

---
 src/lighteval/tasks/tasks/multi_challenge.py | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/src/lighteval/tasks/tasks/multi_challenge.py b/src/lighteval/tasks/tasks/multi_challenge.py
index 92b661340..c771a9fe8 100644
--- a/src/lighteval/tasks/tasks/multi_challenge.py
+++ b/src/lighteval/tasks/tasks/multi_challenge.py
@@ -6,16 +6,18 @@
 nmayorga7/multichallenge
 
 abstract:
-MultiChallenge is a realistic multi-turn conversation evaluation benchmark challenging
-to frontier LLMs. It identifies four categories of challenges in multi-turn conversations
-that require accurate instruction-following, context allocation, and in-context reasoning
-simultaneously. All frontier models achieve less than 50% accuracy on MultiChallenge.
+MultiChallenge evaluates large language models (LLMs) on their ability to
+conduct multi-turn conversations with human users.
+The model is given a target question belonging to one or
+more axes (categories) and must provide a free-form answer.
+The evaluation uses a secondary judge model to determine if the
+answer satisfies the pass criteria for that question.
 
 languages:
 english
 
 tags:
-conversational, generation, multi-turn, instruction-following
+conversational, generation
 
 paper:
 https://arxiv.org/abs/2501.17399
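
Note (not part of the patch): the revised abstract describes the record structure the task depends on: each row carries a conversation, a target question tied to one or more axes, and a YES/NO pass criteria that the judge verdict is compared against. A sketch of that shape, assuming PATCH 2/3 are applied; the field names come from the code above, while every value is invented for illustration:

from lighteval.tasks.tasks.multi_challenge import record_to_sample

# Field names match what record_to_sample and multi_challenge_prompt read; values are made up.
record = {
    "QUESTION_ID": "example-001",
    "AXIS": "instruction-retention",
    "CONVERSATION": [
        {"role": "user", "content": "Please answer in one sentence from now on."},
        {"role": "assistant", "content": "Understood."},
        {"role": "user", "content": "What does MultiChallenge measure?"},
    ],
    "TARGET_QUESTION": "Is the final answer a single sentence?",
    "PASS_CRITERIA": "YES",
}

sample = record_to_sample(record)
print(sample.target)                     # "Is the final answer a single sentence?"
print(sample.metadata["pass_criteria"])  # "YES"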

From 4c572e5829252f4af8ed57c69c222185d173328f Mon Sep 17 00:00:00 2001
From: Akshath Mangudi
Date: Tue, 25 Nov 2025 12:58:51 +0530
Subject: [PATCH 4/5] addressed comments

---
 src/lighteval/tasks/tasks/multi_challenge.py | 97 +++++++++++---------
 1 file changed, 53 insertions(+), 44 deletions(-)

diff --git a/src/lighteval/tasks/tasks/multi_challenge.py b/src/lighteval/tasks/tasks/multi_challenge.py
index c771a9fe8..0f8435645 100644
--- a/src/lighteval/tasks/tasks/multi_challenge.py
+++ b/src/lighteval/tasks/tasks/multi_challenge.py
@@ -17,18 +17,16 @@
 english
 
 tags:
-conversational, generation
+conversational, generation, instruction-following
 
 paper:
 https://arxiv.org/abs/2501.17399
 """
 
-import re
-
 from inspect_ai.dataset import Sample
-from inspect_ai.model import get_model
-from inspect_ai.scorer import Score, Target, accuracy, scorer, stderr
-from inspect_ai.solver import TaskState, generate
+from inspect_ai.model._chat_message import ChatMessageAssistant, ChatMessageUser
+from inspect_ai.scorer import Score, Target, accuracy, model_graded_fact, scorer, stderr
+from inspect_ai.solver import Generate, TaskState, generate, solver
 
 from lighteval.tasks.lighteval_task import LightevalTaskConfig
 from lighteval.tasks.requests import Doc
@@ -39,12 +37,12 @@
 
 The model response is as follows:
 
-{}
+{answer}
 
 
 The criteria that the model response must meet is as follows. Be VERY STRICT!:
 
-{}
+{criterion}
 
 
 Print your reasoning followed by your verdict, either "YES" or "NO"."""
@@ -80,69 +78,80 @@ def multi_challenge_prompt(line, task_name: str = None):
     )
 
 
+base_scorer = model_graded_fact(
+    template=JUDGE_PROMPT,
+    grade_pattern=r"\b(YES|NO)\b",
+    model="openai/gpt-4o-2024-08-06",
+)
+
+
 @scorer(metrics=[accuracy(), stderr()])
 def multi_challenge_scorer():
     async def score(state: TaskState, target: Target):
-        response = state.output.completion
-
-        target_question = target.text
-        pass_criteria = state.metadata.get("pass_criteria", "YES")
+        score = await base_scorer(state, target)
+        judge_verdict = score.value.upper() if score.value else None
 
-        if not target_question:
+        if not judge_verdict or judge_verdict not in ["YES", "NO"]:
             return Score(
                 value="I",
-                answer=response,
-                explanation="Target question not found.",
+                answer=score.answer,
+                explanation=f"Could not extract valid verdict from judge output: {score.explanation}",
             )
 
-        try:
-            judge_model = get_model("openai/gpt-4o-2024-08-06")
-            judge_prompt = JUDGE_PROMPT.format(response, target_question)
+        pass_criteria = state.metadata.get("pass_criteria", "YES")
+        passed = judge_verdict == pass_criteria
 
-            judge_result = await judge_model.generate(judge_prompt)
-            judge_output = judge_result.completion
+        return Score(
+            value="C" if passed else "I",
+            answer=score.answer,
+            explanation=score.explanation,
+        )
 
-            verdict_match = re.search(r"\b(YES|NO)\b", judge_output, re.IGNORECASE)
+    return score
 
-            if not verdict_match:
-                return Score(
-                    value="I",
-                    answer=response,
-                    explanation=f"Could not extract verdict from judge output: {judge_output}.",
-                )
 
-            judge_verdict = verdict_match.group(1).upper()
-            passed = judge_verdict == pass_criteria
+@solver
+def conversation_solver():
+    """Solver that builds conversation history from metadata."""
 
-            return Score(
-                value="C" if passed else "I",
-                answer=response,
-                explanation=f"Judge verdict: {judge_verdict}, Expected: {pass_criteria}, Response: {response}.",
-            )
+    async def solve(state: TaskState, generate: Generate):
+        conversation = state.metadata.get("conversation", [])
 
-        except Exception as e:
-            return Score(
-                value="I",
-                answer=response,
-                explanation=f"Error during judge evaluation: {str(e)}.",
-            )
+        state.messages = []
 
-    return score
+        for msg in conversation:
+            role = msg["role"].lower()
+            content = msg["content"]
+
+            if role == "user":
+                state.messages.append(ChatMessageUser(content=content))
+            elif role == "assistant":
+                state.messages.append(ChatMessageAssistant(content=content))
+
+        return state
+
+    return solve
 
 
 def record_to_sample(record: dict) -> Sample:
     """Convert dataset record to inspect-ai Sample object."""
     conversation = record["CONVERSATION"]
-    formatted_conv = format_conversation(conversation)
+
+    last_msg = None
+    for msg in reversed(conversation):
+        if msg["role"] == "user":
+            last_msg = msg["content"]
+            break
 
     return Sample(
-        input=formatted_conv,
+        input=last_msg or "",
         target=record["TARGET_QUESTION"],
        metadata={
             "question_id": record["QUESTION_ID"],
             "axis": record["AXIS"],
             "pass_criteria": record["PASS_CRITERIA"],
             "conversation": conversation,
+            "length": len(conversation),
         },
     )
@@ -161,7 +170,7 @@ def record_to_sample(record: dict) -> Sample:
     version=0,
     sample_fields=record_to_sample,
     metrics=[],  # Metrics are defined in the scorer decorator for inspect-ai tasks
-    solver=[generate(cache=True)],
+    solver=[conversation_solver(), generate(cache=True)],
     scorer=multi_challenge_scorer(),
 )
 
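
Note (not part of the patch): PATCH 4 stops flattening the conversation into a single prompt string; record_to_sample keeps only the last user message as the sample input, and conversation_solver rebuilds the full multi-turn history as chat messages before generate() runs. A standalone sketch of that role-to-message mapping, using the public re-exports in inspect_ai.model (the patch imports the same classes from the private inspect_ai.model._chat_message module); the example turns are invented:

from inspect_ai.model import ChatMessageAssistant, ChatMessageUser

conversation = [
    {"role": "user", "content": "Remember: no spoilers for season 2."},
    {"role": "assistant", "content": "Got it, I will avoid spoilers."},
    {"role": "user", "content": "So, is the finale worth watching?"},
]

messages = []
for msg in conversation:
    role = msg["role"].lower()
    if role == "user":
        messages.append(ChatMessageUser(content=msg["content"]))
    elif role == "assistant":
        messages.append(ChatMessageAssistant(content=msg["content"]))

print([type(m).__name__ for m in messages])
# ['ChatMessageUser', 'ChatMessageAssistant', 'ChatMessageUser']
# generate() then appends the model's next assistant turn, which the judge
# scores against TARGET_QUESTION / PASS_CRITERIA.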

From d6aa3817f162fb2119ad42f2a5983b816ba9e5de Mon Sep 17 00:00:00 2001
From: Akshath Mangudi
Date: Tue, 9 Dec 2025 21:09:38 +0530
Subject: [PATCH 5/5] addressed comments

---
 src/lighteval/tasks/tasks/multi_challenge.py | 61 +++++++++-----------
 1 file changed, 28 insertions(+), 33 deletions(-)

diff --git a/src/lighteval/tasks/tasks/multi_challenge.py b/src/lighteval/tasks/tasks/multi_challenge.py
index 0f8435645..684bf27b5 100644
--- a/src/lighteval/tasks/tasks/multi_challenge.py
+++ b/src/lighteval/tasks/tasks/multi_challenge.py
@@ -21,6 +21,9 @@
 
 paper:
 https://arxiv.org/abs/2501.17399
+
+starred:
+true
 """
 
 from inspect_ai.dataset import Sample
@@ -32,6 +35,9 @@
 from lighteval.tasks.requests import Doc
 
 
+# NOTE: ChatMessageAssistant and ChatMessageUser are imported from a private module.
+
+
 JUDGE_PROMPT = """You are tasked with evaluating a model response to see if it meets a specific criteria.
 The criteria will always be YES/NO evaluation.
 
@@ -48,45 +54,24 @@
 Print your reasoning followed by your verdict, either "YES" or "NO"."""
 
 
-def format_conversation(conversation: list[dict]) -> str:
-    """Format conversation messages into a single string for model input."""
-    formatted_messages = []
-    for msg in conversation:
-        role = msg["role"].upper()
-        content = msg["content"]
-        formatted_messages.append(f"{role}:\n{content}")
-
-    return "\n\n".join(formatted_messages)
-
-
 def multi_challenge_prompt(line, task_name: str = None):
-    """Convert dataset to Doc object"""
-
-    conversation = line["CONVERSATION"]
-    formatted_conv = format_conversation(conversation)
+    """Stub prompt function for inspect-ai-only task (not used by inspect-ai backend)."""
     return Doc(
         task_name=task_name,
-        query=formatted_conv,
-        instruction=None,
-        specific={
-            "question_id": line["QUESTION_ID"],
-            "axis": line["AXIS"],
-            "target_question": line["TARGET_QUESTION"],
-            "pass_criteria": line["PASS_CRITERIA"],
-            "conversation": conversation,
-        },
+        query="",
+        choices=[],
+        gold_index=0,
     )
 
 
-base_scorer = model_graded_fact(
-    template=JUDGE_PROMPT,
-    grade_pattern=r"\b(YES|NO)\b",
-    model="openai/gpt-4o-2024-08-06",
-)
-
-
 @scorer(metrics=[accuracy(), stderr()])
 def multi_challenge_scorer():
+    base_scorer = model_graded_fact(
+        template=JUDGE_PROMPT,
+        grade_pattern=r"\b(YES|NO)\b",
+        model="openai/gpt-4o-2024-08-06",
+    )
+
     async def score(state: TaskState, target: Target):
         score = await base_scorer(state, target)
         judge_verdict = score.value.upper() if score.value else None
@@ -98,7 +83,14 @@ async def score(state: TaskState, target: Target):
                 explanation=f"Could not extract valid verdict from judge output: {score.explanation}",
             )
 
-        pass_criteria = state.metadata.get("pass_criteria", "YES")
+        pass_criteria = state.metadata.get("pass_criteria", "")
+        if pass_criteria not in ["YES", "NO"]:
+            return Score(
+                value="I",
+                answer=score.answer,
+                explanation=f"Invalid pass criteria: {pass_criteria}",
+            )
+
         passed = judge_verdict == pass_criteria
 
         return Score(
@@ -117,7 +109,8 @@ def conversation_solver():
     async def solve(state: TaskState, generate: Generate):
         conversation = state.metadata.get("conversation", [])
 
-        state.messages = []
+        if not hasattr(state, "messages") or state.messages is None:
+            state.messages = []
 
         for msg in conversation:
             role = msg["role"].lower()
@@ -127,6 +120,8 @@ async def solve(state: TaskState, generate: Generate):
                 state.messages.append(ChatMessageUser(content=content))
             elif role == "assistant":
                 state.messages.append(ChatMessageAssistant(content=content))
+            else:
+                raise ValueError(f"Unsupported role: {role} in conversation.")
 
         return state