# Source code for langcheck.metrics.eval_clients._anthropic

from __future__ import annotations

import asyncio
from collections.abc import Iterable
from typing import Any

from anthropic import Anthropic, AsyncAnthropic

from langcheck.utils.progress_bar import tqdm_wrapper

from ..prompts._utils import get_template
from ._base import EvalClient



# [docs]
class AnthropicEvalClient(EvalClient):
    """EvalClient defined for Anthropic API."""

    def __init__(
        self,
        anthropic_client: Anthropic | None = None,
        anthropic_args: dict[str, Any] | None = None,
        *,
        use_async: bool = False,
    ):
        """
        Initialize the Anthropic evaluation client. The authentication
        information is automatically read from the environment variables,
        so please make sure ANTHROPIC_API_KEY is set.

        Args:
            anthropic_client: (Optional) The Anthropic client to use.
            anthropic_args: (Optional) dict of additional args to pass in to
                the ``client.messages.create`` function
            use_async: (Optional) If True, the async client will be used.
        """
        if anthropic_client:
            self._client = anthropic_client
        elif use_async:
            self._client = AsyncAnthropic()
        else:
            self._client = Anthropic()

        self._anthropic_args = anthropic_args or {}
        self._use_async = use_async

    def _call_api(
        self,
        prompts: Iterable[str | None],
        config: dict[str, Any],
        *,
        tqdm_description: str | None = None,
    ) -> list[Any]:
        # A helper function to call the API with exception filter for alignment
        # of exception handling with the async version.
        def _call_api_with_exception_filter(model_input: dict[str, Any]) -> Any:
            if model_input is None:
                return None
            try:
                return self._client.messages.create(**model_input)
            except Exception as e:
                return e

        model_inputs = [
            {"messages": [{"role": "user", "content": prompt}], **config}
            for prompt in prompts
        ]

        if self._use_async:
            # A helper function to call the async API.
            async def _call_async_api() -> list[Any]:
                responses = await asyncio.gather(
                    *map(
                        lambda model_input: self._client.messages.create(
                            **model_input
                        ),
                        model_inputs,
                    ),
                    return_exceptions=True,
                )
                return responses

            responses = asyncio.run(_call_async_api())
        else:
            responses = [
                _call_api_with_exception_filter(model_input)
                for model_input in tqdm_wrapper(
                    model_inputs, desc=tqdm_description
                )
            ]

        # Filter out exceptions and print them out.
        for i, response in enumerate(responses):
            if not isinstance(response, Exception):
                continue
            print(
                "Anthropic failed to return an assessment corresponding to "
                f"{i}th prompt: {response}"
            )
            responses[i] = None
        return responses


    # [docs]
    def get_text_responses(
        self, prompts: Iterable[str], *, tqdm_description: str | None = None
    ) -> list[str | None]:
        """The function that gets responses to the given prompt texts.
        We use Anthropic's 'claude-3-haiku-20240307' model by default, but you
        can configure it by passing the 'model' parameter in the anthropic_args.

        Args:
            prompts: The prompts you want to get the responses for.

        Returns:
            A list of responses to the prompts. The responses can be None if the
            evaluation fails.
        """
        config = {
            "model": "claude-3-haiku-20240307",
            "max_tokens": 4096,
            "temperature": 0.0,
        }
        config.update(self._anthropic_args or {})
        tqdm_description = tqdm_description or "Intermediate assessments (1/2)"
        responses = self._call_api(
            prompts=prompts, config=config, tqdm_description=tqdm_description
        )
        response_texts = [
            response.content[0].text if response else None
            for response in responses
        ]

        return response_texts



    # [docs]
    def get_float_score(
        self,
        metric_name: str,
        language: str,
        unstructured_assessment_result: list[str | None],
        score_map: dict[str, float],
        *,
        tqdm_description: str | None = None,
    ) -> list[float | None]:
        """The function that transforms the unstructured assessments (i.e. long
        texts that describe the evaluation results) into scores.

        Args:
            metric_name : The name of the metric to be used. (e.g. "toxicity")
            language: The language of the prompts. (e.g. "en")
            unstructured_assessment_result: The unstructured assessment results
                for the given assessment prompts.
            score_map: The mapping from the short assessment results
                (e.g. "Good") to the scores.
            tqdm_description: The description to be shown in the tqdm bar.

        Returns:
            A list of scores for the given prompts. The scores can be None if
            the evaluation fails.
        """
        if language not in ["en", "ja", "de"]:
            raise ValueError(f"Unsupported language: {language}")

        options = list(score_map.keys())
        get_score_template = get_template(f"{language}/get_score/plain_text.j2")
        get_score_prompts = [
            get_score_template.render(
                {
                    "metric": metric_name,
                    "unstructured_assessment": unstructured_assessment,
                    "options": options,
                }
            )
            if unstructured_assessment
            else None
            for unstructured_assessment in unstructured_assessment_result
        ]

        config = {"model": "claude-3-haiku-20240307", "max_tokens": 1024}
        config.update(self._anthropic_args or {})
        tqdm_description = tqdm_description or "Scores (2/2)"
        responses = self._call_api(
            prompts=get_score_prompts,
            config=config,
            tqdm_description=tqdm_description,
        )
        raw_response_texts = [
            response.content[0].text if response else None
            for response in responses
        ]

        def _turn_to_score(response: str | None) -> float | None:
            if response is None:
                return None
            option_found = [option for option in options if option in response]
            # if response contains multiple options as substrings, return None
            if len(option_found) != 1:
                return None
            return score_map[option_found[0]]

        return [_turn_to_score(response) for response in raw_response_texts]



    # [docs]
    def similarity_scorer(self):
        """Always raise, since this client does not support embedding-based
        metrics.

        Raises:
            NotImplementedError: Always.
        """
        # The trailing space keeps the two adjacent literals from being glued
        # together into "AnthropicEvalClient.Use other ...".
        raise NotImplementedError(
            "Embedding-based metrics are not supported in AnthropicEvalClient. "
            "Use other EvalClients to get these metrics."
        )