# Source code for langcheck.metrics.ja._tokenizers

from __future__ import annotations

import abc
from collections.abc import Iterator

from janome.tokenizer import Tokenizer
from rouge_score.tokenizers import Tokenizer as BaseTokenizer

# Japanese punctuation characters
_PUNCTUATIONS = ["、", "。", ",", ".", ",", ".", "?", "!", "?", "!"]


class _JapaneseTokenizer(BaseTokenizer):
    """Common base class for Japanese tokenizers used for ROUGE scoring.

    Subclasses supply the raw token stream via ``_tokenize()``; this class
    then drops empty tokens and Japanese/ASCII punctuation marks.
    """

    @abc.abstractmethod
    def _tokenize(self, text: str) -> Iterator[str]:
        # Subclasses must yield surface-form tokens for `text`.
        raise NotImplementedError(
            "Tokenizer for Japanese must override `_tokenize()` method")

    def tokenize(self, text: str) -> list[str]:
        """Tokenize `text` and filter out empty tokens and punctuation."""
        filtered: list[str] = []
        for token in self._tokenize(text):
            if token and token not in _PUNCTUATIONS:
                filtered.append(token)
        return filtered


class MeCabTokenizer(_JapaneseTokenizer):
    """
    An optional tokenizer to calculate rouge score base on MeCab.

    .. note::
        The advantage of using MeCab is that the core implementation is
        written in a compiled language and runs much faster than Janome. If
        you are processing large data, consider setting up MeCab and using
        the :class:`~langcheck.metrics.ja.MeCabTokenizer`.
        On the other hand, it takes more effort to install it on some
        environments and may not work. Please refer to the `official page
        <https://taku910.github.io/mecab/>`_ if the Python wrapper,
        mecab-python3, does not work in your environment.
    """

    class _MeCabNodeSurfaceIterator(Iterator):
        # Iterates over the surface forms of a MeCab node chain, skipping
        # the BOS/EOS sentinel nodes MeCab places around the parsed text.

        def __init__(self, node) -> None:
            self._node = node
            # Skip BOS.
            if node.feature.startswith("BOS/EOS"):
                self._node = self._node.next

        def __next__(self):
            # Stop iteration when the node is EOS.
            if self._node.feature.startswith("BOS/EOS"):
                raise StopIteration
            node = self._node
            self._node = self._node.next
            return node.surface

    def __init__(self):
        try:
            # Ignore the missing imports error since MeCab is optional.
            import MeCab  # type: ignore[reportMissingImports]
        except ModuleNotFoundError:
            raise ModuleNotFoundError(
                "No module named 'MeCab'.\n"
                "Since 'MeCabTokenizer' is an optional feature, 'MeCab' "
                "is not installed by default along with langcheck. Please "
                "set up 'MeCab' on your own.")
        self.tokenizer = MeCab.Tagger()

    def _tokenize(self, text):
        # Returns an iterator over token surfaces from MeCab's node chain.
        return MeCabTokenizer._MeCabNodeSurfaceIterator(
            self.tokenizer.parseToNode(text))
class JanomeTokenizer(_JapaneseTokenizer):
    """Janome based Tokenizer for Japanese.

    The default tokenizer to calculate rouge score base on Janome.

    .. note::
        The advantage of using Janome is that it is a pure Python library
        and introduces no additional dependencies. On the other hand, it
        takes more time to parse sentences than a MeCab-based tokenizer.
        Specifically, it takes seconds every time when constructing this
        class since the Janome tokenizer loads the entire dictionary during
        initialization. If you are processing large data, consider setting
        up MeCab and using the
        :class:`~langcheck.metrics.ja.MeCabTokenizer`.
    """

    def __init__(self):
        # Loads the full Janome dictionary; this can take a few seconds.
        self.tokenizer = Tokenizer()

    def _tokenize(self, text: str):
        # wakati=True makes Janome return plain token strings.
        return self.tokenizer.tokenize(text, wakati=True)