Source code for langcheck.augment.en._to_full_width
from __future__ import annotations
import random
import jaconv
[docs]
def to_full_width(
    instances: list[str] | str,
    *,
    aug_char_p: float = 1.0,
    num_perturbations: int = 1,
    seed: int | None = None,
) -> list[str]:
    """Convert randomly chosen ASCII characters to their full-width forms.

    Applies a text perturbation to each string in ``instances`` (usually a
    list of prompts): every character is independently selected with
    probability ``aug_char_p`` and, if selected, passed through
    ``jaconv.h2z(..., kana=False, ascii=True, digit=True)``. Characters that
    are not selected are copied through unchanged.

    Args:
        instances: A single string or a list of strings to be augmented.
        aug_char_p: Probability, in the closed range [0, 1], that any given
            character is selected for conversion. ``0`` leaves every string
            unchanged; ``1`` sends every character through the conversion.
        num_perturbations: The number of perturbed instances to generate for
            each string in ``instances``.
        seed: The seed for the random number generator. You can fix the seed
            to deterministically choose which characters to change. Note that
            this reseeds the module-level ``random`` generator, so it also
            affects later users of ``random`` in the same process.

    Returns:
        A list of perturbed instances, grouped by input string: the
        ``num_perturbations`` variants of the first string come first, then
        those of the second string, and so on.

    Raises:
        ValueError: If ``aug_char_p`` is not within [0, 1] (including NaN).
    """
    # Written as a negated chained comparison so that NaN, which compares
    # False against everything, is rejected instead of slipping through.
    if not 0 <= aug_char_p <= 1:
        raise ValueError("aug_char_p must be between 0 and 1")

    if seed is not None:
        random.seed(seed)

    instances = [instances] if isinstance(instances, str) else instances

    perturbed_instances = []
    for instance in instances:
        for _ in range(num_perturbations):
            # Exactly one random draw per character, in order, so seeded runs
            # are reproducible. random.random() lies in [0.0, 1.0), so the
            # strict "<" never augments at aug_char_p == 0 and always
            # augments at aug_char_p == 1.
            perturbed_instances.append(
                "".join(
                    jaconv.h2z(char, kana=False, ascii=True, digit=True)
                    if random.random() < aug_char_p
                    else char
                    for char in instance
                )
            )
    return perturbed_instances