Source code for langcheck.augment.ja._synonym
from __future__ import annotations
import random
from contextlib import suppress
from chikkarpy import Chikkar
from chikkarpy.dictionarylib import Dictionary as chikkardict
with suppress(ImportError):
from sudachipy import Dictionary # type: ignore[reportMissingImports]
[docs]
def synonym(
    instances: list[str] | str,
    *,
    num_perturbations: int = 1,
    seed: int | None = None,
    **kwargs,
) -> list[str]:
    """Applies a text perturbation to each string in instances (usually a list
    of prompts) where some words are replaced with synonyms.

    Each input string is tokenized with Sudachi, and every token for which
    the Chikkar synonym dictionary returns at least one synonym is replaced,
    with probability ``aug_p``, by a randomly chosen synonym. Tokens without
    synonyms are always kept as their original surface form.

    Args:
        instances: A single string or a list of strings to be augmented.
        num_perturbations: The number of perturbed instances to generate for
            each string in instances
        aug_p: Probability with which each word that has synonyms is
            replaced. Passed as an extra keyword argument (via ``**kwargs``).
            Defaults to `0.8`.
        seed: The seed for the random number generator. You can fix the seed to
            deterministically choose which words to change. When given, it is
            applied to the global :mod:`random` generator.

    Returns:
        A list of perturbed instances. The perturbations of the first input
        string come first, followed by those of the second, and so on, so the
        list has ``len(instances) * num_perturbations`` entries.

    .. note::
        This function requires `sudachidict_core` and `sudachipy` to be
        installed in your environment.
        Please refer to the `official instructions <https://github.com/
        WorksApplications/SudachiPy?tab=readme-ov-file#setup>`_ to install them.
    """
    if seed is not None:
        # Seeds the module-level generator used by the draws below.
        random.seed(seed)

    # `Dictionary` is only bound when the optional `sudachipy` import at the
    # top of the file succeeded.
    sudachi_dict = Dictionary()  # type: ignore[reportUnboundVariable]
    chikkar = Chikkar()
    chikkar.add_dictionary(chikkardict())
    sudachi_tokenizer = sudachi_dict.create()

    aug_p = kwargs.get("aug_p", 0.8)
    instances = [instances] if isinstance(instances, str) else instances

    perturbed_instances = []
    for instance in instances:
        # The surface form and the synonym candidates of a token do not depend
        # on the perturbation round, so look them up once per instance instead
        # of once per round.
        candidates = [
            (token.surface(), chikkar.find(token.normalized_form()))
            for token in sudachi_tokenizer.tokenize(instance)
        ]
        for _ in range(num_perturbations):
            pieces = []
            for surface, synonyms in candidates:
                # Only draw a random number when the token has synonyms; this
                # keeps the sequence of random draws (and therefore seeded
                # results) the same as a per-round lookup would produce.
                if synonyms and random.random() < aug_p:
                    pieces.append(random.choice(synonyms))
                else:
                    pieces.append(surface)
            perturbed_instances.append("".join(pieces))
    return perturbed_instances