# Source code for langcheck.metrics.zh._tokenizers
from __future__ import annotations

import abc
from collections.abc import Iterator

import hanlp
from rouge_score.tokenizers import Tokenizer as BaseTokenizer

# Fine-grained Chinese tokenizer; the model weights are 43MB+.
DEFAULT_TOKENIZER_WEIGHT = hanlp.pretrained.tok.FINE_ELECTRA_SMALL_ZH # type: ignore[reportGeneralTypeIssues] # noqa: E501

# List of common Chinese punctuation marks:
# https://github.com/yikeke/zh-style-guide/blob/master/source/%E6%A0%87%E7%82%B9%E7%AC%A6%E5%8F%B7/%E5%B8%B8%E7%94%A8%E4%B8%AD%E6%96%87%E6%A0%87%E7%82%B9%E7%AC%A6%E5%8F%B7.md
_PUNCTUATIONS = [
    "、",
    ",",
    "。",
    ":",
    ";",
    "?",
    "!",
    "?",
    "!",
    "~",
    "-",
    "—",
    "——",
    "……",
    "⋯⋯",
    "/",
]


class _ChineseTokenizer(BaseTokenizer):
    """Base class for Chinese tokenizers that strips empty tokens and
    Chinese punctuation from the tokenized output."""

    @abc.abstractmethod
    def _tokenize(self, text: str) -> Iterator[str]:
        raise NotImplementedError(
            "Tokenizer for Chinese must override `_tokenize()` method"
        )

    def tokenize(self, text: str) -> list[str]:
        tokens = self._tokenize(text)
        # Drop empty tokens and Chinese punctuation before scoring.
        return [
            token for token in tokens if (token and token not in _PUNCTUATIONS)
        ]
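
# A minimal sketch of the `_tokenize()` contract, using a hypothetical
# character-level subclass (illustrative only, not part of this module):
#
#     class _CharTokenizer(_ChineseTokenizer):
#         def _tokenize(self, text: str) -> Iterator[str]:
#             return iter(text)
#
#     _CharTokenizer().tokenize("你好,世界!")  # -> ["你", "好", "世", "界"]
#
# Subclasses only yield raw tokens; `tokenize()` then filters out empty
# strings and the punctuation marks listed above.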


class HanLPTokenizer(_ChineseTokenizer):
    """HanLP-based tokenizer for Chinese.

    This is the default tokenizer for computing ROUGE scores on Chinese
    text, built on HanLP.

    .. note::
        `HanLP <https://github.com/hankcs/HanLP/tree/doc-zh>`_ is an
        actively maintained NLP library that was initially developed for
        Chinese language processing.

    We run one of HanLP's single-task models in HanLP's pipeline mode,
    because:

    1. HanLP offers both multi-task and single-task models. The multi-task
       models are quite large (generally 400MB+), whereas the single-task
       models are only ~40MB, so we use a single-task model by default.
    2. HanLP's pipeline mode processes long texts (i.e. many sentences)
       efficiently: it splits the text into sentences and applies the
       tokenizer to those sentences in parallel.
    """

    def __init__(self) -> None:
        super().__init__()
        tokenizer = hanlp.load(DEFAULT_TOKENIZER_WEIGHT)
        # Stage 1: split the input text into sentences.
        self.tokenizer_pipeline = hanlp.pipeline().append(
            hanlp.utils.rules.split_sentence  # type: ignore[reportGeneralTypeIssues]
        )
        # Stage 2: tokenize each sentence, then flatten the per-sentence
        # token lists into a single token list.
        self.tokenizer_pipeline = self.tokenizer_pipeline.append(
            tokenizer
        ).append(lambda sents: sum(sents, []))

    def _tokenize(self, text: str) -> Iterator[str]:
        tokens = self.tokenizer_pipeline(text)
        return tokens  # type: ignore[reportGeneralTypeIssues]
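
# Usage sketch: `rouge_score`'s `RougeScorer` accepts a custom `tokenizer`
# argument, so this class can be plugged in directly when scoring Chinese
# text (the ~43MB model weights are downloaded on first load). The example
# strings below are illustrative only:
#
#     from rouge_score.rouge_scorer import RougeScorer
#
#     scorer = RougeScorer(["rouge1", "rougeL"], tokenizer=HanLPTokenizer())
#     scores = scorer.score(
#         target="今天天气很好。", prediction="今天的天气非常好。"
#     )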