Source code for langcheck.metrics.de.source_based_text_quality

from __future__ import annotations

from typing import Dict, List, Optional, Tuple

from openai import OpenAI

from langcheck.metrics._validation import (
    validate_parameters_context_relevance, validate_parameters_source_based)
from langcheck.metrics.de._translation import Translate
from langcheck.metrics.en._openai import OpenAIBasedEvaluator
from langcheck.metrics.en.source_based_text_quality import \
    factual_consistency as en_factual_consistency
from langcheck.metrics.metric_value import MetricValue
from langcheck.utils.progess_bar import tqdm_wrapper

_factual_consistency_translation_model_path = 'Helsinki-NLP/opus-mt-de-en'

LANG = 'de'


[docs]def factual_consistency(
        generated_outputs: List[str] | str,
        sources: List[str] | str,
        prompts: Optional[List[str] | str] = None,
        model_type: str = 'local',
        openai_client: Optional[OpenAI] = None,
        openai_args: Optional[Dict[str, str]] = None,
        *,
        use_async: bool = False) -> MetricValue[Optional[float]]:
    '''Calculates the factual consistency between the generated outputs and
    the sources. This metric takes on float values between [0, 1], where 0
    means that the output is not at all consistent with the source text, and 1
    means that the output is fully consistent with the source text. (NOTE: when
    using the OpenAI model, the factuality scores are either 0.0, 0.5, or 1.0.
    The score may also be `None` if it could not be computed.)

    We currently support three model types:

    1. The 'local' type, where the 'unieval-fact' model is downloaded
    from HuggingFace and run locally. This is the default model type and
    there is no setup needed to run this.
    This function wraps :func:`~langcheck.metrics.en.factual_consistency`
    using the translation model ``Helsinki-NLP/opus-mt-de-en`` to translate the
    German texts to English before computing the factual consistency
    scores. This is because the UniEval-fact model is trained on English
    text.

    2. The 'openai' type, where we use OpenAI's 'gpt-turbo-3.5' model
    by default. While the model you use is configurable, please make sure to use
    one that supports function calling
    (https://platform.openai.com/docs/guides/gpt/function-calling). See
    `this example <https://langcheck.readthedocs.io/en/latest/metrics.html
    #computing-metrics-with-openai-models>`__
    for examples on setting up the OpenAI API key.

    3. The 'azure_openai' type. Essentially the same as the 'openai' type,
    except that it uses the AzureOpenAI client. Note that you must specify your
    model deployment to use in ``openai_args``, e.g.
    ``openai_args={'model': 'YOUR_DEPLOYMENT_NAME'}``

    Args:
        generated_outputs: The model generated output(s) to evaluate
        sources: The source text(s), one string per generated output
        prompts: The prompts used to generate the output(s). Prompts are
            optional metadata and not used to calculate the metric.
        model_type: The type of model to use ('local', 'openai', or
            'azure_openai'), default 'local'
        openai_client: OpenAI or AzureOpenAI client, default None. If this is
            None but ``model_type`` is 'openai' or 'azure_openai', we will
            attempt to create a default client.
        openai_args: Dict of additional args to pass in to the
            ``client.chat.completions.create`` function, default None
        use_async: Whether to use the asynchronous API for OpenAI, default False

    Returns:
        An MetricValue object
    '''
    generated_outputs, sources, prompts = validate_parameters_source_based(
        generated_outputs, sources, prompts)
    assert model_type in [
        'local', 'openai', 'azure_openai'
    ], ('Unsupported model type. '
        'The supported ones are ["local", "openai", "azure_openai"]')

    if model_type == 'openai' or model_type == 'azure_openai':
        scores, explanations = _factual_consistency_openai(generated_outputs,
                                                           sources,
                                                           model_type,
                                                           openai_client,
                                                           openai_args,
                                                           use_async=use_async)

        return MetricValue(metric_name='factual_consistency',
                           prompts=prompts,
                           generated_outputs=generated_outputs,
                           reference_outputs=None,
                           sources=sources,
                           explanations=explanations,
                           metric_values=scores,
                           language=LANG)

    # Translate the sources and generated outputs to English.
    # Currently, the type checks are not working for the pipeline, since
    # too diverse types can be returned.
    translation = Translate(_factual_consistency_translation_model_path)
    batch_size = 8
    en_source = []
    for i in tqdm_wrapper(range(0, len(sources), batch_size),
                          desc='Translating sources',
                          total=(len(sources) + batch_size - 1) // batch_size):
        batch_sources = sources[i:i + batch_size]
        en_source.extend([translation(src) for src in batch_sources])
    en_generated_outputs = []
    for i in tqdm_wrapper(range(0, len(generated_outputs), batch_size),
                          desc='Translating generated outputs',
                          total=(len(generated_outputs) + batch_size - 1) //
                          batch_size):
        batch_generated_outputs = generated_outputs[i:i + batch_size]
        en_generated_outputs.extend(
            [translation(gen_out) for gen_out in batch_generated_outputs])

    # Compute the factual consistency scores in English.
    metric_value = en_factual_consistency(
        generated_outputs=en_generated_outputs, sources=en_source)
    metric_value.language = LANG
    return metric_value


def _factual_consistency_openai(
    generated_outputs: List[str],
    sources: List[str],
    client_type: str,
    client: Optional[OpenAI],
    openai_args: Optional[Dict[str, str]],
    *,
    use_async: bool = False
) -> Tuple[List[Optional[float]], List[Optional[str]]]:
    '''Calculates the factual consistency and their associated explanations
    between each generated output and its corresponding source text. The
    consistency is computed by calling the OpenAI API, with a prompt similar to
    the one used in OpenAI Evals. We leverage the function calling API to make
    sure that the output is structured such that we can compute a score. If a
    score could not be computed, `None` is inserted to the score and explanation
    lists.

    Ref:
        https://github.com/openai/evals/blob/e49868e550babb7b1c5b4223c9b7a14511bf114d/evals/registry/modelgraded/fact.yaml
        https://platform.openai.com/docs/guides/gpt/function-calling

    Args:
        generated_outputs: The model generated output(s) to evaluate
        sources: The source text(s), one string per generated output
        client_type: The type of OpenAI client ('openai' or 'azure_openai')
        client: (Optional) OpenAI or AzureOpenAI client. If this is None, we
            will attempt to create a default client depending on the
            ``client_type``.
        openai_args: (Optional) Dict of additional args to pass in to the
            ``client.chat.completions.create`` function
        use_async: Whether to use the asynchronous API for OpenAI

    Returns:
        score_list: a list of scores
        explanation_list: a list of explanations for the scores
    '''

    # TODO: The prompt formation, and the scoring system, can do with some
    # improvement. There are some cases where consistent outputs get incorrectly
    # assessed as "Partially Consistent", and there's no differentiation
    # between an output that is unrelated to the source and an output that is
    # straight up contradictory.
    def _prompt(src: str, gen_output: str) -> str:
        return f'''
        Sie bewerten die faktische Konsistenz einer eingereichten Behauptung.
        Hier sind die Daten:
        [BEGINN DER DATEN]
        ************
        [Quelle]: {src}
        ************
        [Benutzeranfrage]: {gen_output}
        ************
        [ENDE DER DATEN]

        Bestimmen Sie, ob die eingereichte Behauptung faktisch konsistent mit
        der Quelle ist. Die verfügbaren Bewertungen sind:
        `Vollständig Konsistent` - Die eingereichte Behauptung ist vollständig
        faktisch konsistent mit dem Quelltext.
        `Teilweise Konsistent` - Die eingereichte Behauptung ist teilweise
        faktisch konsistent mit dem Quelltext. Es gibt einige Aspekte der
        Behauptung, die faktisch konsistent sind, aber auch einige, die es
         nicht sind.
        `Nicht Konsistent` - Die eingereichte Behauptung ist nicht faktisch
        konsistent mit dem Quelltext.

        Atmen Sie tief durch und bearbeiten Sie dieses Problem Schritt für
        Schritt.
        '''

    def _function_call_prompt(long_assessment: str) -> str:
        return f'''
        Folgendes ist eine Bewertung zur faktischen Konsistenz einer Behauptung:
        ************
        [Bewertung]: {long_assessment}
        ************

        Speichern Sie die resultierende Bewertung. Die verfügbaren Bewertungen
         sind:
        `Vollständig Konsistent`
        `Teilweise Konsistent`
        `Nicht Konsistent`
        '''

    factuality_assessment_to_score = {
        'Vollständig Konsistent': 1.0,
        'Teilweise Konsistent': 0.5,
        'Nicht Konsistent': 0.0
    }
    oai_evaluator = OpenAIBasedEvaluator(
        assessment_to_score_mapping=factuality_assessment_to_score,
        function_name='save_factual_consistency_assessment',
        function_description=(
            "Saves a submitted claim's factual consistency assessment."),
        argument_name='factuality',
        argument_description='The factual consistency assessment of the claim',
        client_type=client_type,
        client=client,
        openai_args=openai_args,
        use_async=use_async)

    scores, explanations = oai_evaluator.get_score(
        map(_prompt, sources, generated_outputs), _function_call_prompt)

    return scores, explanations


[docs]def context_relevance(sources: List[str] | str,
                      prompts: List[str] | str,
                      model_type: str = 'openai',
                      openai_client: Optional[OpenAI] = None,
                      openai_args: Optional[Dict[str, str]] = None,
                      *,
                      use_async: bool = False) -> MetricValue[Optional[float]]:
    '''Calculates the relevance of the sources to the prompts. This metric takes
    on float values between [0, 1], where 0 means that the source text is not at
    all relevant to the prompt, and 1 means that the source text is fully
    relevant to the prompt.

    We currently support two model types:

    1. The 'openai' type, where we use OpenAI's 'gpt-turbo-3.5' model
    by default. While the model you use is configurable, please make sure to use
    one that supports function calling
    (https://platform.openai.com/docs/guides/gpt/function-calling). See
    `this page <https://langcheck.readthedocs.io/en/latest/metrics.html
    #computing-metrics-with-openai-models>`__
    for examples on setting up the OpenAI API key.

    2. The 'azure_openai' type. Essentially the same as the 'openai' type,
    except that it uses the AzureOpenAI client. Note that you must specify your
    model deployment to use in ``openai_args``, e.g.
    ``openai_args={'model': 'YOUR_DEPLOYMENT_NAME'}``

    Args:
        sources: The source text(s), one string per prompt
        prompts: The prompt(s)
        model_type: The type of model to use ('openai' or 'azure_openai'),
            default 'openai'
        openai_client: OpenAI or AzureOpenAI client, default None. If this is
            None, we will attempt to create a default client.
        openai_args: Dict of additional args to pass in to the
            ``client.chat.completions.create`` function, default None
        use_async: Whether to use the asynchronous API for OpenAI, default False
    '''
    prompts, sources = validate_parameters_context_relevance(prompts, sources)

    def _prompt(src: str, user_query: str) -> str:
        return f'''
        Sie bewerten die Relevanz der Quelle für eine Benutzeranfrage. Hier
        sind die Daten:
        [BEGINN DATEN]
        ************
        [Quelle]: {src}
        ************
        [Benutzeranfrage]: {user_query}
        ************
        [ENDE DATEN]

        Bestimmen Sie, ob die Quelle die relevanten und notwendigen
        Informationen enthält, um auf die Anfrage des Benutzers zu antworten.
        Die verfügbaren Bewertungen sind:
        `Vollständig relevant` - Der Quelltext enthält die Informationen, die
        notwendig sind, um auf die Anfrage des Benutzers zu antworten.
        `Teilweise relevant` - Der Quelltext ist teilweise relevant für die
        Anfrage des Benutzers, enthält aber nicht alle Informationen,
        die notwendig sind, um auf die Anfrage des Benutzers zu antworten.
        `Nicht relevant` - Der Quelltext ist nicht relevant für die Anfrage des
        Benutzers.

        Atmen Sie tief durch und arbeiten Sie Schritt für Schritt an diesem
        Problem.
        '''

    def _function_call_prompt(long_assessment: str) -> str:
        return f'''
        Folgendes ist eine Bewertung über die Relevanz einer Quelle:
        ************
        [Bewertung]: {long_assessment}
        ************

        Speichern Sie die resultierende Bewertung. Die verfügbaren Bewertungen
        sind:
        `Vollständig relevant`
        `Teilweise relevant`
        `Nicht relevant`
        '''

    context_relevance_assessment_to_score = {
        'Vollständig relevant': 1.0,
        'Teilweise relevant': 0.5,
        'Nicht relevant': 0.0
    }
    oai_evaluator = OpenAIBasedEvaluator(
        assessment_to_score_mapping=context_relevance_assessment_to_score,
        function_name='save_context_relevance_assessment',
        function_description=("Saves a context relevance assessment."),
        argument_name='context_relevance',
        argument_description='The context relevance assessment',
        client_type=model_type,
        client=openai_client,
        openai_args=openai_args,
        use_async=use_async)

    scores, explanations = oai_evaluator.get_score(
        map(_prompt, sources, prompts), _function_call_prompt)

    return MetricValue(metric_name='context_relevance',
                       prompts=prompts,
                       generated_outputs=None,
                       reference_outputs=None,
                       sources=list(sources),
                       explanations=explanations,
                       metric_values=scores,
                       language=LANG)