# Source code for langcheck.augment.ja._synonym

from __future__ import annotations

import random
from contextlib import suppress

from chikkarpy import Chikkar
from chikkarpy.dictionarylib import Dictionary as chikkardict

with suppress(ImportError):
    from sudachipy import Dictionary  # type: ignore[reportMissingImports]



# [docs]
def synonym(
    instances: list[str] | str,
    *,
    num_perturbations: int = 1,
    seed: int | None = None,
    **kwargs,
) -> list[str]:
    """Applies a text perturbation to each string in instances (usually a list
    of prompts) where some words are replaced with synonyms.

    Args:
        instances: A single string or a list of strings to be augmented.
        num_perturbations: The number of perturbed instances to generate for
            each string in instances
        aug_p: Percentage of words with synonymous which will be augmented.
            Defaults to `0.8`.
        seed: The seed for the random number generator. You can fix the seed to
            deterministically choose which words to change.

    Returns:
        A list of perturbed instances.


    .. note::
        This function requires `sudachidict_core` and `sudachipy` to be
        installed in your environment.
        Please refer to the `official instructions <https://github.com/
        WorksApplications/SudachiPy?tab=readme-ov-file#setup>`_ to install them.

    """

    if seed is not None:
        random.seed(seed)

    _SudachiDict = Dictionary()  # type: ignore[reportUnboundVariable]

    chikkar = Chikkar()
    chikkar.add_dictionary(chikkardict())
    sudachi_tokenizer = _SudachiDict.create()

    kwargs["aug_p"] = kwargs.get("aug_p", 0.8)

    instances = [instances] if isinstance(instances, str) else instances
    perturbed_instances = []
    for instance in instances:
        tokens = sudachi_tokenizer.tokenize(instance)
        for _ in range(num_perturbations):
            perturbed_instance = ""
            for token in tokens:
                synonym = token.surface()
                if (
                    synonyms := chikkar.find(token.normalized_form())
                ) and random.random() < kwargs["aug_p"]:
                    synonym = random.choice(synonyms)
                perturbed_instance += synonym
            perturbed_instances.append(perturbed_instance)
    return perturbed_instances