from __future__ import annotations
import asyncio
from typing import Any, Iterable
from anthropic import Anthropic, AsyncAnthropic
from langcheck.utils.progess_bar import tqdm_wrapper
from ..prompts._utils import get_template
from ._base import EvalClient
[docs]class AnthropicEvalClient(EvalClient):
'''EvalClient defined for Anthropic API.
'''
def __init__(self,
anthropic_client: Anthropic | None = None,
anthropic_args: dict[str, Any] | None = None,
*,
use_async: bool = False):
'''
Initialize the Anthropic evaluation client. The authentication
information is automatically read from the environment variables,
so please make sure ANTHROPIC_API_KEY is set.
Args:
anthropic_client: (Optional) The Anthropic client to use.
anthropic_args: (Optional) dict of additional args to pass in to
the ``client.messages.create`` function
use_async: (Optional) If True, the async client will be used.
'''
if anthropic_client:
self._client = anthropic_client
elif use_async:
self._client = AsyncAnthropic()
else:
self._client = Anthropic()
self._anthropic_args = anthropic_args or {}
self._use_async = use_async
def _call_api(self,
prompts: Iterable[str | None],
config: dict[str, Any],
*,
tqdm_description: str | None = None) -> list[Any]:
# A helper function to call the API with exception filter for alignment
# of exception handling with the async version.
def _call_api_with_exception_filter(model_input: dict[str, Any]) -> Any:
if model_input is None:
return None
try:
return self._client.messages.create(**model_input)
except Exception as e:
return e
model_inputs = [{
"messages": [{
"role": "user",
"content": prompt
}],
**config
} for prompt in prompts]
if self._use_async:
# A helper function to call the async API.
async def _call_async_api() -> list[Any]:
responses = await asyncio.gather(*map(
lambda model_input: self._client.messages.create(
**model_input), model_inputs),
return_exceptions=True)
return responses
responses = asyncio.run(_call_async_api())
else:
responses = [
_call_api_with_exception_filter(model_input)
for model_input in tqdm_wrapper(model_inputs,
desc=tqdm_description)
]
# Filter out exceptions and print them out.
for i, response in enumerate(responses):
if not isinstance(response, Exception):
continue
print('Anthropic failed to return an assessment corresponding to '
f'{i}th prompt: {response}')
responses[i] = None
return responses
[docs] def get_text_responses(
self,
prompts: Iterable[str],
*,
tqdm_description: str | None = None) -> list[str | None]:
'''The function that gets resonses to the given prompt texts.
We use Anthropic's 'claude-3-haiku-20240307' model by default, but you
can configure it by passing the 'model' parameter in the anthropic_args.
Args:
prompts: The prompts you want to get the responses for.
Returns:
A list of responses to the prompts. The responses can be None if the
evaluation fails.
'''
config = {
"model": "claude-3-haiku-20240307",
"max_tokens": 4096,
"temperature": 0.0
}
config.update(self._anthropic_args or {})
tqdm_description = tqdm_description or 'Intermediate assessments (1/2)' # NOQA: E501
responses = self._call_api(prompts=prompts,
config=config,
tqdm_description=tqdm_description)
response_texts = [
response.content[0].text if response else None
for response in responses
]
return response_texts
[docs] def get_float_score(
self,
metric_name: str,
language: str,
unstructured_assessment_result: list[str | None],
score_map: dict[str, float],
*,
tqdm_description: str | None = None) -> list[float | None]:
'''The function that transforms the unstructured assessments (i.e. long
texts that describe the evaluation results) into scores.
Args:
metric_name : The name of the metric to be used. (e.g. "toxicity")
language: The language of the prompts. (e.g. "en")
unstructured_assessment_result: The unstructured assessment results
for the given assessment prompts.
score_map: The mapping from the short assessment results
(e.g. "Good") to the scores.
tqdm_description: The description to be shown in the tqdm bar.
Returns:
A list of scores for the given prompts. The scores can be None if
the evaluation fails.
'''
if language not in ['en', 'ja', 'de']:
raise ValueError(f'Unsupported language: {language}')
options = list(score_map.keys())
get_score_template = get_template(f'{language}/get_score/plain_text.j2')
get_score_prompts = [
get_score_template.render({
'metric': metric_name,
'unstructured_assessment': unstructured_assessment,
'options': options,
}) if unstructured_assessment else None
for unstructured_assessment in unstructured_assessment_result
]
config = {"model": "claude-3-haiku-20240307", "max_tokens": 1024}
config.update(self._anthropic_args or {})
tqdm_description = tqdm_description or 'Scores (2/2)'
responses = self._call_api(prompts=get_score_prompts,
config=config,
tqdm_description=tqdm_description)
raw_response_texts = [
response.content[0].text if response else None
for response in responses
]
def _turn_to_score(response: str | None) -> float | None:
if response is None:
return None
option_found = [option for option in options if option in response]
# if response contains multiple options as substrings, return None
if len(option_found) != 1:
return None
return score_map[option_found[0]]
return [_turn_to_score(response) for response in raw_response_texts]
[docs] def similarity_scorer(self):
raise NotImplementedError(
'Embedding-based metrics are not supported in AnthropicEvalClient.'
'Use other EvalClients to get these metrics.')