# Source code for langcheck.metrics.ja._tokenizers

from __future__ import annotations

import abc
from collections.abc import Iterator

from janome.tokenizer import Tokenizer
from rouge_score.tokenizers import Tokenizer as BaseTokenizer

# Japanese punctuation characters
_PUNCTUATIONS = ["、", "。", ",", ".", ",", ".", "?", "!", "?", "!"]


class _JapaneseTokenizer(BaseTokenizer):
    """Common base class for Japanese tokenizers used for ROUGE scoring.

    Subclasses supply the raw token stream via ``_tokenize()``; this class
    then drops empty tokens and Japanese/ASCII punctuation marks.
    """

    @abc.abstractmethod
    def _tokenize(self, text: str) -> Iterator[str]:
        # Subclasses must yield surface-form tokens for `text`.
        raise NotImplementedError(
            "Tokenizer for Japanese must override `_tokenize()` method")

    def tokenize(self, text: str) -> list[str]:
        """Tokenize `text` and filter out empty tokens and punctuation."""
        filtered: list[str] = []
        for token in self._tokenize(text):
            if token and token not in _PUNCTUATIONS:
                filtered.append(token)
        return filtered


class MeCabTokenizer(_JapaneseTokenizer):
    """
    An optional tokenizer to calculate rouge score base on MeCab.

    .. note::
        The advantage of using MeCab is that the core implementation is
        written in a compiled language and runs much faster than Janome. If
        you are processing large data, consider setting up MeCab and using
        the :class:`~langcheck.metrics.ja.MeCabTokenizer`.
        On the other hand, it takes more effort to install it on some
        environments and may not work. Please refer to the `official page
        <https://taku910.github.io/mecab/>`_ if the Python wrapper,
        mecab-python3, does not work in your environment.
    """

    class _MeCabNodeSurfaceIterator(Iterator):
        # Iterates over the surface forms of a MeCab node chain, skipping
        # the BOS/EOS sentinel nodes MeCab places around the parsed text.

        def __init__(self, node) -> None:
            self._node = node
            # Skip BOS.
            if node.feature.startswith("BOS/EOS"):
                self._node = self._node.next

        def __next__(self):
            # Stop iteration when the node is EOS.
            if self._node.feature.startswith("BOS/EOS"):
                raise StopIteration
            node = self._node
            self._node = self._node.next
            return node.surface

    def __init__(self):
        try:
            # Ignore the missing imports error since MeCab is optional.
            import MeCab  # type: ignore[reportMissingImports]
        except ModuleNotFoundError:
            raise ModuleNotFoundError(
                "No module named 'MeCab'.\n"
                "Since 'MeCabTokenizer' is an optional feature, 'MeCab' "
                "is not installed by default along with langcheck. Please "
                "set up 'MeCab' on your own.")
        self.tokenizer = MeCab.Tagger()

    def _tokenize(self, text):
        # Returns an iterator over token surfaces from MeCab's node chain.
        return MeCabTokenizer._MeCabNodeSurfaceIterator(
            self.tokenizer.parseToNode(text))
class JanomeTokenizer(_JapaneseTokenizer):
    """Janome based Tokenizer for Japanese.

    The default tokenizer to calculate rouge score base on Janome.

    .. note::
        The advantage of using Janome is that it is a pure Python library
        and introduces no additional dependencies. On the other hand, it
        takes more time to parse sentences than a MeCab-based tokenizer.
        Specifically, it takes seconds every time when constructing this
        class since the Janome tokenizer loads the entire dictionary during
        initialization. If you are processing large data, consider setting
        up MeCab and using the
        :class:`~langcheck.metrics.ja.MeCabTokenizer`.
    """

    def __init__(self):
        # Loads the full Janome dictionary; this can take a few seconds.
        self.tokenizer = Tokenizer()

    def _tokenize(self, text: str):
        # wakati=True makes Janome return plain token strings.
        return self.tokenizer.tokenize(text, wakati=True)