from __future__ import annotations
from pathlib import Path
from jinja2 import Template
from langcheck.metrics._pairwise_text_quality_utils import (
compute_pairwise_comparison_metric_values_with_consistency,
)
from langcheck.metrics.eval_clients import EvalClient
from langcheck.metrics.metric_inputs import (
IndividualInputType,
get_metric_inputs,
)
from langcheck.metrics.metric_value import MetricValue
def custom_evaluator(
generated_outputs: list[str] | str | None,
prompts: list[str] | str | None,
sources: list[str] | str | None,
reference_outputs: list[str] | str | None,
eval_model: EvalClient,
metric_name: str,
score_map: dict[str, float],
template_path: str,
language: str,
*,
additional_inputs: dict[str, IndividualInputType] | None = None,
additional_input_name_to_prompt_var_mapping: dict[str, str] | None = None,
) -> MetricValue[float | None]:
"""Calculates the scores of a custom evaluator. The EvalClient will first
assess the provided inputs using the prompt template, and then convert those
assessments into scores using the score map.
The prompt template should be a Jinja2 file (file extension .j2) that
specifies the criteria that an LLM (as configured in the EvalClient) should
follow when evaluating an instance. The template may contain placeholders
for the following variables (NOTE: not all are required):
- `gen_output`: The generated output
- `user_query`: The prompt
- `src`: The source text
- `ref_output`: The reference output
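For example, a minimal English template might look like the following (a
hypothetical sketch for illustration, not a template shipped with
LangCheck)::

    You are evaluating a system's response to a user's query.
    [User's Query]: {{ user_query }}
    [System's Response]: {{ gen_output }}
    Decide whether the response is "Good", "Neutral", or "Bad" based on its
    helpfulness and correctness, and state your assessment.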
By specifying additional inputs, the prompt template can be more flexible.
The additional inputs should be passed as a dictionary, where the keys are
the input names and the values are the corresponding values. The additional
inputs can be mapped to variable names in the prompt template using the
`additional_input_name_to_prompt_var_mapping` dictionary.
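For example (using hypothetical names), passing
``additional_inputs={'persona': 'a patient teacher'}`` together with
``additional_input_name_to_prompt_var_mapping={'persona': 'persona'}`` makes
``{{ persona }}`` available as a placeholder in the prompt template.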
The prompt template should also specify the final available assessments for
the LLM evaluator, e.g. "Good", "Bad", "Neutral", etc. The score map should
then map each of those available assessments to a numerical score. E.g. if
the available assessments in the prompt template are "Good", "Bad", and
"Neutral", the score map should be something like:
``score_map = {'Good': 1.0, 'Neutral': 0.5, 'Bad': 0.0}``
NOTE: We have found that LLMs sometimes behave erratically when the
assessments are non-ASCII strings (see
https://github.com/citadel-ai/langcheck/pull/84 for an example). We therefore
recommend making the final assessments ASCII strings, even when the rest of
the prompt template contains non-ASCII characters (e.g. Japanese).
Args:
generated_outputs: The model generated output(s)
prompts: The prompts used to generate the output(s)
sources: The source(s) of the generated output(s)
reference_outputs: The reference output(s)
eval_model: The EvalClient instance used for the evaluation
metric_name: The name of the metric
score_map: A dictionary mapping the evaluator's assessments to scores
template_path: The path to the prompt template file. This should be a
Jinja2 file (file extension .j2).
language: The language that the evaluator will use ('en', 'ja', or 'de')
additional_inputs: Additional inputs other than the standard ones.
additional_input_name_to_prompt_var_mapping: A dictionary that maps the
additional input names to the variable names in the prompt template.
Returns:
A MetricValue object
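Example:
    A minimal usage sketch. The eval client class and the template path
    below are illustrative assumptions; substitute your own::

        from langcheck.metrics.eval_clients import OpenAIEvalClient

        eval_client = OpenAIEvalClient()  # any configured EvalClient works
        result = custom_evaluator(
            generated_outputs=["Paris is the capital of France."],
            prompts=["What is the capital of France?"],
            sources=None,
            reference_outputs=None,
            eval_model=eval_client,
            metric_name="answer_quality",
            score_map={"Good": 1.0, "Neutral": 0.5, "Bad": 0.0},
            template_path="answer_quality.j2",  # your own Jinja2 template
            language="en",
        )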
"""
if language not in ["en", "ja", "de"]:
raise ValueError(f"Unsupported language: {language}")
metric_inputs = get_metric_inputs(
generated_outputs=generated_outputs,
prompts=prompts,
sources=sources,
reference_outputs=reference_outputs,
additional_inputs=additional_inputs,
additional_input_name_to_prompt_var_mapping=additional_input_name_to_prompt_var_mapping,
required_params=[],
)
assert Path(template_path).exists(), (
f"Prompt template file {template_path} does not exist."
)
assert template_path.endswith(".j2"), (
'The prompt template file must be a Jinja2 template file with the extension ".j2"'
)
prompt_template_source = Path(template_path).read_text(encoding="utf-8")
metric_inputs.validate_template(prompt_template_source)
prompt_template = Template(prompt_template_source)
return eval_model.compute_metric_values_from_template(
metric_inputs=metric_inputs,
template=prompt_template,
metric_name=metric_name,
language=language,
score_map=score_map,
)
def custom_pairwise_evaluator(
generated_outputs_a: list[str] | str | None,
generated_outputs_b: list[str] | str | None,
prompts: list[str] | str | None,
sources_a: list[str] | str | None,
sources_b: list[str] | str | None,
reference_outputs: list[str] | str | None,
eval_model: EvalClient,
metric_name: str,
score_map: dict[str, float],
template_path: str,
language: str,
enforce_consistency: bool = True,
) -> MetricValue[float | None]:
"""Calculates the scores of a custom pairwise evaluator, where "pairwise"
means that the Responses and/or Sources of two systems will be compared
against each other. The EvalClient will first assess the provided inputs
using the prompt template, and then convert those assessments into scores
using the score map.
The prompt template should be a Jinja2 file (file extension .j2) that
specifies the criteria that an LLM (as configured in the EvalClient) should
follow when evaluating an instance. The template may contain placeholders
for the following variables (NOTE: not all are required):
- `gen_output_a`: Model A's generated output
- `gen_output_b`: Model B's generated output
- `user_query`: The prompt
- `src_a`: The source text for Model A
- `src_b`: The source text for Model B
- `ref_output`: The reference output
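For example, a pairwise template might contain a section like the following
(a hypothetical sketch for illustration)::

    [User's Query]: {{ user_query }}
    [Model A's Response]: {{ gen_output_a }}
    [Model B's Response]: {{ gen_output_b }}
    Decide which response is better, answering "Response A", "Response B",
    or "Tie".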
The prompt template should also specify the final available assessments for
the LLM evaluator, e.g. "Response A", "Response B", "Tie", etc. The score
map should then map each of those available assessments to a numerical
score. E.g. if the available assessments in the prompt template are
"Response A", "Response B", and "Tie", the score map should be something
like:
``score_map = {'Response A': 0.0, 'Response B': 1.0, 'Tie': 0.5}``
NOTE: If `enforce_consistency` is True, please make sure that the score map
is symmetric, in the sense that swapping Model A and Model B should result
in inverse scores. See the code below for more details.
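For instance, with the score map above, if the evaluator answers
"Response A" (score 0.0) for the original ordering, it should answer
"Response B" (score 1.0) once Model A and Model B are swapped, so that the
two runs mirror each other around the "Tie" score of 0.5.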
NOTE: We have found that LLMs sometimes behave erratically when the
assessments are non-ASCII strings (see
https://github.com/citadel-ai/langcheck/pull/84 for an example). We therefore
recommend making the final assessments ASCII strings, even when the rest of
the prompt template contains non-ASCII characters (e.g. Japanese).
Args:
generated_outputs_a: Model A's generated output(s)
generated_outputs_b: Model B's generated output(s)
prompts: The prompts used to generate the output(s)
sources_a: The source(s) for Model A's generated output(s)
sources_b: The source(s) for Model B's generated output(s)
reference_outputs: The reference output(s)
eval_model: The EvalClient instance used for the evaluation
metric_name: The name of the metric
score_map: A dictionary mapping the evaluator's assessments to scores
template_path: The path to the prompt template file. This should be a
Jinja2 file (file extension .j2).
language: The language that the evaluator will use ('en', 'ja', or 'de')
enforce_consistency: When this is True, we will only return a score if
the score is the same when Model A and Model B are swapped. This is
useful for ensuring that the evaluator's position bias does not
impact the scores. Defaults to True.
Returns:
A MetricValue object
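Example:
    A minimal usage sketch. The eval client class and the template path
    below are illustrative assumptions; substitute your own::

        from langcheck.metrics.eval_clients import OpenAIEvalClient

        eval_client = OpenAIEvalClient()  # any configured EvalClient works
        result = custom_pairwise_evaluator(
            generated_outputs_a=["Paris."],
            generated_outputs_b=["The capital of France is Paris."],
            prompts=["What is the capital of France?"],
            sources_a=None,
            sources_b=None,
            reference_outputs=None,
            eval_model=eval_client,
            metric_name="pairwise_answer_quality",
            score_map={"Response A": 0.0, "Response B": 1.0, "Tie": 0.5},
            template_path="pairwise_answer_quality.j2",  # your own template
            language="en",
            enforce_consistency=True,
        )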
"""
if language not in ["en", "ja", "de"]:
raise ValueError(f"Unsupported language: {language}")
metric_inputs = get_metric_inputs(
generated_outputs=(generated_outputs_a, generated_outputs_b),
prompts=prompts,
sources=(sources_a, sources_b),
reference_outputs=reference_outputs,
required_params=[],
)
assert Path(template_path).exists(), (
f"Prompt template file {template_path} does not exist."
)
assert template_path.endswith(".j2"), (
'The prompt template file must be a Jinja2 template file with the extension ".j2"'
)
prompt_template_source = Path(template_path).read_text(encoding="utf-8")
metric_inputs.validate_template(prompt_template_source)
prompt_template = Template(prompt_template_source)
if enforce_consistency:
return compute_pairwise_comparison_metric_values_with_consistency(
eval_client=eval_model,
metric_inputs=metric_inputs,
template=prompt_template,
metric_name=metric_name,
language=language,
score_map=score_map,
)
else:
return eval_model.compute_metric_values_from_template(
metric_inputs=metric_inputs,
template=prompt_template,
metric_name=metric_name,
language=language,
score_map=score_map,
)