# Source code for langcheck.metrics.zh._tokenizers

from __future__ import annotations

import abc
from collections.abc import Iterator

import hanlp
from rouge_score.tokenizers import Tokenizer as BaseTokenizer

# Identifier of the default HanLP tokenizer model: a fine-grained Chinese
# word-segmentation model (~43MB download), resolved to weights via
# `hanlp.load()` in HanLPTokenizer.__init__.
DEFAULT_TOKENIZER_WEIGHT = hanlp.pretrained.tok.FINE_ELECTRA_SMALL_ZH  # type: ignore[reportGeneralTypeIssues] # noqa: E501
# Chinese puncuations list
# https://github.com/yikeke/zh-style-guide/blob/master/source/%E6%A0%87%E7%82%B9%E7%AC%A6%E5%8F%B7/%E5%B8%B8%E7%94%A8%E4%B8%AD%E6%96%87%E6%A0%87%E7%82%B9%E7%AC%A6%E5%8F%B7.md
_PUNCTUATIONS = [
    "、",
    ",",
    "。",
    ":",
    ";",
    "?",
    "!",
    "?",
    "!",
    "~",
    "-",
    "—",
    "——",
    "……",
    "⋯⋯",
    "/",
]


class _ChineseTokenizer(BaseTokenizer):
    """Abstract base for Chinese tokenizers used in rouge score computation.

    Subclasses provide the raw segmentation via ``_tokenize()``; the public
    ``tokenize()`` then drops empty strings and Chinese punctuation marks
    from the result.
    """

    @abc.abstractmethod
    def _tokenize(self, text: str) -> Iterator[str]:
        # Concrete subclasses must supply the actual segmentation logic.
        raise NotImplementedError(
            "Tokenizer for Chinese must override `_tokenize()` method"
        )

    def tokenize(self, text: str) -> list[str]:
        """Tokenize *text*, filtering out empty tokens and punctuation."""
        filtered: list[str] = []
        for token in self._tokenize(text):
            if token and token not in _PUNCTUATIONS:
                filtered.append(token)
        return filtered


class HanLPTokenizer(_ChineseTokenizer):
    """HanLP based Tokenizer for Chinese.

    The default tokenizer to calculate rouge score based on HanLP.

    .. note::
        `HanLP <https://github.com/hankcs/HanLP/tree/doc-zh>`_ is an actively
        maintained NLP library that was initially developed for Chinese
        language processing. We run HanLP's single-task models using HanLP's
        pipeline mode, because:

        1. HanLP has both multi-task models and single-task models. The
           multi-task models are quite large (generally 400MB+), whereas the
           single-task models are only ~40MB. So, we use a single-task model
           by default.
        2. HanLP's pipeline mode allows processing of long texts (i.e. many
           sentences) efficiently in parallel. It splits long text into
           sentences and applies the tokenizer to the sentences in parallel.
    """

    def __init__(self) -> None:
        super().__init__()
        tokenizer = hanlp.load(DEFAULT_TOKENIZER_WEIGHT)
        # Build the pipeline: split the input into sentences, tokenize each
        # sentence, then flatten the per-sentence token lists into one list.
        self.tokenizer_pipeline = hanlp.pipeline().append(
            hanlp.utils.rules.split_sentence  # type: ignore[reportGeneralTypeIssues]
        )
        self.tokenizer_pipeline = self.tokenizer_pipeline.append(
            tokenizer
        ).append(
            # Flatten with a comprehension: O(n) in the total number of
            # tokens, unlike the quadratic `sum(sents, [])` idiom.
            lambda sents: [token for sent in sents for token in sent]
        )

    def _tokenize(self, text: str) -> Iterator[str]:
        """Run the HanLP pipeline on *text* and return the flat token list."""
        tokens = self.tokenizer_pipeline(text)
        return tokens  # type: ignore[reportGeneralTypeIssues]