Source code for langcheck.augment.ja._conv_kana
from __future__ import annotations
import random
import jaconv
[docs]
def conv_hiragana(
    instances: list[str] | str,
    convert_to: str = "kata",
    *,
    aug_char_p: float = 1.0,
    num_perturbations: int = 1,
    seed: int | None = None,
) -> list[str]:
    """Convert hiragana in the text to katakana or alphabet characters.

    Each character of each input string is independently selected for
    conversion with probability ``aug_char_p``. Selected characters are passed
    one at a time to the ``jaconv`` converter chosen by ``convert_to``;
    unselected characters are copied unchanged.

    Args:
        instances: A single string or a list of strings to be augmented.
        convert_to: The target script to convert to. Available values are

            - 'kata' for katakana
            - 'hkata' for half-width katakana
            - 'alpha' for alphabets
        aug_char_p: Probability (between 0 and 1 inclusive) that any given
            character is passed to the converter.
        num_perturbations: The number of perturbed instances to generate for
            each string in instances.
        seed: The seed for the random number generator. You can fix the seed to
            deterministically choose which characters to change. Note that
            this seeds the global ``random`` module state.

    Returns:
        A list of perturbed instances, ``num_perturbations`` consecutive
        entries per input string, in input order.

    Raises:
        ValueError: If ``aug_char_p`` is not within [0, 1] (NaN included), or
            if ``convert_to`` is not one of the supported values.
    """
    # Validation on aug_char_p. The chained comparison also rejects NaN, which
    # would slip through separate `< 0` / `> 1` checks.
    if not 0 <= aug_char_p <= 1:
        raise ValueError("aug_char_p must be between 0 and 1")

    # Validate convert_to up front so that a bad value is reported even when
    # no character ends up being selected for augmentation (empty input,
    # aug_char_p == 0, num_perturbations == 0, ...).
    if convert_to not in ("kata", "hkata", "alpha"):
        raise ValueError(
            "convert_to must be one of 'kata', 'hkata', or 'alpha'"
        )

    if seed is not None:
        random.seed(seed)

    instances = [instances] if isinstance(instances, str) else instances

    perturbed_instances = []
    for instance in instances:
        for _ in range(num_perturbations):
            # Collect pieces and join once instead of repeated `+=`.
            pieces = []
            for char in instance:
                # Exactly one random draw per character, so a given seed
                # always selects the same characters.
                if random.random() > aug_char_p:
                    pieces.append(char)  # No augmentation
                elif convert_to == "kata":
                    pieces.append(jaconv.hira2kata(char))
                elif convert_to == "hkata":
                    pieces.append(jaconv.hira2hkata(char))
                else:  # convert_to == "alpha", guaranteed by validation above
                    pieces.append(jaconv.kana2alphabet(char))
            perturbed_instances.append("".join(pieces))

    return perturbed_instances