Source code for langcheck.metrics.de._tokenizers

from nltk.stem.cistem import Cistem
from nltk.tokenize import word_tokenize
from rouge_score.tokenizers import Tokenizer as BaseTokenizer


class DeTokenizer(BaseTokenizer):
    """Tokenizer for German.

    This tokenizer is used to calculate rouge score for German.
    """

    def __init__(self, stemmer=False):
        # When requested, attach NLTK's CISTEM stemmer; otherwise leave
        # stemming disabled so tokenize() passes text through unchanged.
        self.stemmer = Cistem() if stemmer else None

    def tokenize(self, text: str) -> list[str]:
        """Tokenize *text*, optionally reducing it to its CISTEM stem first."""
        if self.stemmer is not None:
            # use only the stem part of the word
            text, _ = self.stemmer.segment(text)
        return word_tokenize(text)