Source code for oumi.core.configs.params.evaluation_params
# Copyright 2025 - Oumi
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from dataclasses import dataclass, field
from enum import Enum
from typing import Any, Optional

from oumi.core.configs.params.base_params import BaseParams
class EvaluationBackend(Enum):
    """Enum representing the evaluation backend to use."""

    LM_HARNESS = "lm_harness"
    ALPACA_EVAL = "alpaca_eval"
    CUSTOM = "custom"
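
# Hedged illustration (not part of the original module): the string values above are
# the backend names users supply in configs, so a raw string can also be resolved
# directly via the enum's standard value lookup, e.g.:
#
#     EvaluationBackend("lm_harness")   # -> EvaluationBackend.LM_HARNESS
#     EvaluationBackend("alpaca_eval")  # -> EvaluationBackend.ALPACA_EVAL
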
@dataclass
class EvaluationTaskParams(BaseParams):
    """Configuration parameters for model evaluation tasks.

    Supported backends:

    - LM Harness: Framework for evaluating language models on standard benchmarks.
      A list of all supported tasks can be found at:
      https://github.com/EleutherAI/lm-evaluation-harness/tree/main/lm_eval/tasks.
    - Alpaca Eval: Framework for evaluating language models on instruction-following
      and the quality of responses to open-ended questions.
    - Custom: Users can register their own evaluation functions using the decorator
      `@register_evaluation_function`. The `task_name` should be the registry key of
      the custom evaluation function to be used.

    Examples:
        .. code-block:: python

            # LM Harness evaluation on MMLU
            params = EvaluationTaskParams(
                evaluation_backend="lm_harness",
                task_name="mmlu",
                eval_kwargs={"num_fewshot": 5}
            )

        .. code-block:: python

            # Alpaca Eval 2.0 evaluation
            params = EvaluationTaskParams(
                evaluation_backend="alpaca_eval"
            )

        .. code-block:: python

            # Custom evaluation
            @register_evaluation_function("my_evaluation_function")
            def my_evaluation(task_params, config):
                accuracy = ...
                return EvaluationResult(task_result={"accuracy": accuracy})

            params = EvaluationTaskParams(
                task_name="my_evaluation_function",
                evaluation_backend="custom"
            )
    """

    evaluation_backend: str = ""
    """The evaluation backend to use for the current task."""

    task_name: Optional[str] = None
    """The task to evaluate or the custom evaluation function to use.

    For LM Harness evaluations (when `evaluation_backend` is set to
    EvaluationBackend.LM_HARNESS), the `task_name` corresponds to a predefined task
    to evaluate on (e.g. "mmlu"). A list of all tasks supported by the LM Harness
    backend can be found by running: `lm-eval --tasks list`.

    For custom evaluations (when `evaluation_backend` is set to
    EvaluationBackend.CUSTOM), the `task_name` should be the registry key of the
    custom evaluation function to be used. Users can register new evaluation
    functions using the decorator `@register_evaluation_function`.
    """

    num_samples: Optional[int] = None
    """Number of samples/examples to evaluate from this dataset.

    Mostly for debugging, in order to reduce the runtime.
    If not set (None): the entire dataset is evaluated.
    If set, this must be a positive integer.
    """

    log_samples: Optional[bool] = False
    """Whether to log the samples used for evaluation.

    If not set (False): the model samples used for evaluation will not be logged.
    If set to True: the model samples generated during inference and used for
    evaluation will be logged in `backend_config.json`. The backend may also log
    other intermediate results related to inference.
    """

    eval_kwargs: dict[str, Any] = field(default_factory=dict)
    """Additional keyword arguments to pass to the evaluation function.

    This allows for passing any evaluation-specific parameters that are not
    covered by other fields in TaskParams classes.
    """

    def get_evaluation_backend(self) -> EvaluationBackend:
        """Returns the evaluation backend as an Enum."""
        if not self.evaluation_backend:
            raise ValueError(
                "Missing `evaluation_backend`. When running evaluations, it is "
                "necessary to specify the evaluation backend to use for EACH task. "
                "The available backends can be found in the following enum: "
                "`oumi.core.configs.params.evaluation_params.EvaluationBackend`. "
                f"Current options: {EvaluationTaskParams.list_evaluation_backends()}."
            )
        elif self.evaluation_backend == EvaluationBackend.LM_HARNESS.value:
            return EvaluationBackend.LM_HARNESS
        elif self.evaluation_backend == EvaluationBackend.ALPACA_EVAL.value:
            return EvaluationBackend.ALPACA_EVAL
        elif self.evaluation_backend == EvaluationBackend.CUSTOM.value:
            return EvaluationBackend.CUSTOM
        else:
            raise ValueError(f"Unknown evaluation backend: {self.evaluation_backend}")

    @staticmethod
    def list_evaluation_backends() -> str:
        """Returns a string listing all available evaluation backends."""
        return ", ".join([backend.value for backend in EvaluationBackend])

    def __post_init__(self):
        """Verifies params."""
        if self.num_samples is not None and self.num_samples <= 0:
            raise ValueError("`num_samples` must be None or a positive integer.")
@dataclass
class LMHarnessTaskParams(EvaluationTaskParams):
    """Parameters for the LM Harness evaluation framework.

    LM Harness is a comprehensive benchmarking suite for evaluating language models
    across various tasks.
    """

    num_fewshot: Optional[int] = None
    """Number of few-shot examples (with responses) to add in the prompt, in order to
    teach the model how to respond to the specific dataset's prompts.

    If not set (None): LM Harness will decide the value.
    If set to 0: no few-shot examples will be added in the prompt.
    """

    def __post_init__(self):
        """Verifies params."""
        if not self.task_name:
            raise ValueError("`task_name` must be a valid LM Harness task.")
        if self.num_fewshot and self.num_fewshot < 0:
            raise ValueError("`num_fewshot` must be non-negative.")
@dataclass
class AlpacaEvalTaskParams(EvaluationTaskParams):
    """Parameters for the AlpacaEval evaluation framework.

    AlpacaEval is an LLM-based automatic evaluation suite that is fast, cheap,
    replicable, and validated against 20K human annotations. The latest version
    (AlpacaEval 2.0) contains 805 prompts (tatsu-lab/alpaca_eval), which are
    open-ended questions. A model annotator (judge) is used to evaluate the quality
    of the model's responses to these questions and to calculate win rates vs.
    reference responses. The default judge is GPT4 Turbo.
    """

    version: Optional[float] = 2.0
    """The version of AlpacaEval to use. Options: 1.0 or 2.0 (default)."""

    def __post_init__(self):
        """Verifies params."""
        if self.version not in [1.0, 2.0]:
            raise ValueError("AlpacaEval `version` must be 1.0 or 2.0.")