Source code for langcheck.metrics.eval_clients._anthropic

from __future__ import annotations

import asyncio
from collections.abc import Iterable
from typing import Any

from anthropic import Anthropic, AsyncAnthropic

from langcheck.utils.progress_bar import tqdm_wrapper

from ..prompts._utils import get_template
from ._base import EvalClient


class AnthropicEvalClient(EvalClient):
    """EvalClient defined for Anthropic API."""

    def __init__(
        self,
        anthropic_client: Anthropic | None = None,
        anthropic_args: dict[str, Any] | None = None,
        *,
        use_async: bool = False,
    ):
        """
        Initialize the Anthropic evaluation client. The authentication
        information is automatically read from the environment variables,
        so please make sure ANTHROPIC_API_KEY is set.

        Args:
            anthropic_client: (Optional) The Anthropic client to use. When
                given, it is used as-is; ``use_async`` must then match the
                kind of client passed in, because ``use_async`` alone decides
                whether ``messages.create`` is awaited.
            anthropic_args: (Optional) dict of additional args to pass in to
                the ``client.messages.create`` function
            use_async: (Optional) If True, the async client will be used.
        """
        if anthropic_client is not None:
            self._client = anthropic_client
        elif use_async:
            self._client = AsyncAnthropic()
        else:
            self._client = Anthropic()

        self._anthropic_args = anthropic_args or {}
        self._use_async = use_async

    @staticmethod
    def _extract_text(response: Any) -> str | None:
        """Return the text of the first content block of ``response``.

        Returns None when the response is missing (failed or skipped call) or
        carries no content blocks, so one empty reply cannot abort a batch.
        """
        if not response or not response.content:
            return None
        return response.content[0].text

    def _call_api(
        self,
        prompts: Iterable[str | None],
        config: dict[str, Any],
        *,
        tqdm_description: str | None = None,
    ) -> list[Any]:
        """Call ``messages.create`` once per prompt and collect the results.

        Args:
            prompts: The prompts to send. A None prompt is not sent to the
                API; its slot in the result is None.
            config: Keyword arguments merged into every ``messages.create``
                call (e.g. model, max_tokens).
            tqdm_description: The description shown in the progress bar of
                the synchronous path.

        Returns:
            A list aligned with ``prompts``. Each item is the API response,
            or None if the prompt was None or the call raised an exception.
        """
        # Keep None prompts as None inputs so that no request is issued for
        # them (an upstream step already failed for that item).
        model_inputs: list[dict[str, Any] | None] = [
            None
            if prompt is None
            else {"messages": [{"role": "user", "content": prompt}], **config}
            for prompt in prompts
        ]

        # A helper function to call the API with exception filter for alignment
        # of exception handling with the async version.
        def _call_api_with_exception_filter(
            model_input: dict[str, Any] | None,
        ) -> Any:
            if model_input is None:
                return None
            try:
                return self._client.messages.create(**model_input)
            except Exception as e:
                return e

        responses: list[Any]
        if self._use_async:
            # Only the non-None inputs are sent; remember their positions so
            # the results can be placed back in prompt order.
            pending = [
                (index, model_input)
                for index, model_input in enumerate(model_inputs)
                if model_input is not None
            ]

            # A helper function to call the async API.
            async def _call_async_api() -> list[Any]:
                return await asyncio.gather(
                    *(
                        self._client.messages.create(**model_input)
                        for _, model_input in pending
                    ),
                    return_exceptions=True,
                )

            responses = [None] * len(model_inputs)
            for (index, _), result in zip(
                pending, asyncio.run(_call_async_api())
            ):
                responses[index] = result
        else:
            responses = [
                _call_api_with_exception_filter(model_input)
                for model_input in tqdm_wrapper(
                    model_inputs, desc=tqdm_description
                )
            ]

        # Filter out exceptions and print them out.
        for i, response in enumerate(responses):
            if not isinstance(response, Exception):
                continue
            print(
                "Anthropic failed to return an assessment corresponding to "
                f"{i}th prompt: {response}"
            )
            responses[i] = None
        return responses

    def get_text_responses(
        self, prompts: Iterable[str], *, tqdm_description: str | None = None
    ) -> list[str | None]:
        """The function that gets responses to the given prompt texts.
        We use Anthropic's 'claude-3-haiku-20240307' model by default, but you
        can configure it by passing the 'model' parameter in the
        anthropic_args.

        Args:
            prompts: The prompts you want to get the responses for.
            tqdm_description: The description to be shown in the tqdm bar.

        Returns:
            A list of responses to the prompts. The responses can be None if the
            evaluation fails.
        """
        config = {
            "model": "claude-3-haiku-20240307",
            "max_tokens": 4096,
            "temperature": 0.0,
        }
        # User-supplied arguments take precedence over the defaults above.
        config.update(self._anthropic_args or {})
        tqdm_description = tqdm_description or "Intermediate assessments (1/2)"
        responses = self._call_api(
            prompts=prompts, config=config, tqdm_description=tqdm_description
        )
        return [self._extract_text(response) for response in responses]

    def get_float_score(
        self,
        metric_name: str,
        language: str,
        unstructured_assessment_result: list[str | None],
        score_map: dict[str, float],
        *,
        tqdm_description: str | None = None,
    ) -> list[float | None]:
        """The function that transforms the unstructured assessments (i.e. long
        texts that describe the evaluation results) into scores.

        Args:
            metric_name: The name of the metric to be used. (e.g. "toxicity")
            language: The language of the prompts. (e.g. "en")
            unstructured_assessment_result: The unstructured assessment results
                for the given assessment prompts.
            score_map: The mapping from the short assessment results
                (e.g. "Good") to the scores.
            tqdm_description: The description to be shown in the tqdm bar.

        Returns:
            A list of scores for the given prompts. The scores can be None if
            the evaluation fails.

        Raises:
            ValueError: If ``language`` is not one of "en", "ja" or "de".
        """
        if language not in ["en", "ja", "de"]:
            raise ValueError(f"Unsupported language: {language}")

        options = list(score_map.keys())
        get_score_template = get_template(f"{language}/get_score/plain_text.j2")
        # Missing or empty assessments yield a None prompt, which _call_api
        # maps to a None response and therefore a None score.
        get_score_prompts = [
            get_score_template.render(
                {
                    "metric": metric_name,
                    "unstructured_assessment": unstructured_assessment,
                    "options": options,
                }
            )
            if unstructured_assessment
            else None
            for unstructured_assessment in unstructured_assessment_result
        ]

        config = {"model": "claude-3-haiku-20240307", "max_tokens": 1024}
        # User-supplied arguments take precedence over the defaults above.
        config.update(self._anthropic_args or {})
        tqdm_description = tqdm_description or "Scores (2/2)"
        responses = self._call_api(
            prompts=get_score_prompts,
            config=config,
            tqdm_description=tqdm_description,
        )
        raw_response_texts = [
            self._extract_text(response) for response in responses
        ]

        def _turn_to_score(response: str | None) -> float | None:
            if response is None:
                return None
            option_found = [option for option in options if option in response]
            # The answer is only trusted when exactly one option appears as a
            # substring; zero or multiple matches are ambiguous -> None.
            if len(option_found) != 1:
                return None
            return score_map[option_found[0]]

        return [_turn_to_score(response) for response in raw_response_texts]

    def similarity_scorer(self):
        """Not supported: this client always raises.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError(
            "Embedding-based metrics are not supported in AnthropicEvalClient. "
            "Use other EvalClients to get these metrics."
        )