Source code for langcheck.augment.en._gender._gender
from __future__ import annotations
from collections.abc import Iterable
from random import choice
from types import MappingProxyType
import nltk
from nltk import pos_tag, word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer
from langcheck.augment.en._gender._gender_pronouns import (
_PRONOUNS_DICT,
_BaseGenderPronouns,
)
# This dictionary is used to determine the form of the pronoun.
# Note that his and hers are not included in this dictionary because they can be
# either of two different forms depending on the context.
_PRONOUNS_FORM_DICT = MappingProxyType(
{
"she": "subject",
"he": "subject",
"they": "subject",
"their": "dependent_possessive",
"him": "object",
"them": "object",
"hers": "independent_possessive",
"theirs": "independent_possessive",
"himself": "reflexive",
"herself": "reflexive",
"themselves": "reflexive",
}
)
def _get_pronoun_form(word: str, tag: str) -> str | None:
"""Get pronoun form of the word."""
# Handle degenerated cases.
if tag == "PRP$":
# PRP$ tag denotes a possessive pronoun.
return "dependent_possessive"
if word.lower() == "her":
return "object"
if word.lower() == "his":
return "independent_possessive"
return _PRONOUNS_FORM_DICT.get(word.lower())
def _replace_pronoun(word: str, tag: str, target_pronouns: _BaseGenderPronouns):
# When the word is not a pronoun, return the word itself.
if (pronoun_form := _get_pronoun_form(word, tag)) is None:
return word
# Replace the pronoun with the target pronoun with the same form.
replaced_pronoun = getattr(target_pronouns, pronoun_form)
if word.isupper():
return replaced_pronoun.upper()
elif word.istitle():
return replaced_pronoun.title()
# When the word is not first letter capitalized or uppercase only, return
# the word in lowercase, regardless of how irregularly the word is
# capitalized.
return replaced_pronoun
def _replace_gender_pronouns(
text: str,
target_pronouns: _BaseGenderPronouns,
) -> str:
"""Replace target pronouns in text with new pronouns.
Args:
target_pronouns (_BaseGenderPronouns): Pronouns to replace with.
text (str): Text to be augmented.
Returns:
str: Augmented text.
"""
try:
nltk.data.find("taggers/averaged_perceptron_tagger_eng")
except LookupError:
nltk.download("averaged_perceptron_tagger_eng")
try:
nltk.data.find("tokenizers/punkt_tab")
except LookupError:
nltk.download("punkt_tab")
tagged_words = pos_tag(word_tokenize(text))
augmented_words = [
_replace_pronoun(word, tag, target_pronouns)
for (word, tag) in tagged_words
]
return TreebankWordDetokenizer().detokenize(augmented_words)
[docs]
def gender(
texts: Iterable[str] | str,
*,
to_gender: str = "plural",
) -> list[str]:
"""Replace pronouns with that of specified gender.
Args:
texts: Iterable of texts to be augmented.
to_gender: Replacing pronoun type string ('male', 'female',
'neutral', or 'plural'). Default to `plural`.
Returns:
List of sentences with replaced pronouns.
.. note::
Replacing neopronouns with other neopronouns is not supported yet
because `NLTK <https://www.nltk.org/>`_ does not recognize them.
"""
if (to_gender is not None) and (
to_gender not in ["female", "male", "neutral", "plural"]
):
raise ValueError(
f"The argument 'gender' must be one of 'female', 'male', 'neutral',"
f" or 'plural', but got {to_gender}."
)
target_gender = choice(_PRONOUNS_DICT[to_gender])
if isinstance(texts, str):
return [_replace_gender_pronouns(texts, target_gender)]
elif isinstance(texts, Iterable):
return [_replace_gender_pronouns(text, target_gender) for text in texts]
else:
raise TypeError(
f"Expected texts to be a string or iterable of strings but got "
f"{type(texts)}."
)