"""
Plane class; supports chained method calls.
"""
from plane.func import PATTERNS, compile_regex
from plane.pattern import ASCII_WORD, Token
from plane.punctuation import punc
class Plane:
    """
    Text-processing helper whose mutating methods return the instance, so
    calls can be chained.

    A new instance starts with an empty :attr:`text` and an empty
    :attr:`values` list; use :meth:`update` to load a string.
    """

    def __init__(self):
        self._text = ""
        self._values = []

    @property
    def text(self):
        """Current text held by the instance (read-only)."""
        return self._text

    @property
    def values(self):
        """List held alongside the text; reset to empty by :meth:`update`."""
        return self._values

    def replace(self, regex, repl=None, result=False):
        """
        Replace every span matched by ``regex`` with ``repl``.

        :param Regex regex: :class:`Regex` forwarded to ``self.extract``; \
            its ``repl`` attribute is the default replacement
        :param str repl: replacement string; if not ``None`` it overrides \
            ``regex.repl``
        :param bool result: if ``True``, return the new string and leave \
            the stored text untouched
        :return: the replaced string when ``result`` is true, otherwise \
            ``self`` (with the stored text updated) for chaining
        """
        # NOTE(review): ``self.extract`` is not defined in the visible part of
        # this class. It is assumed to yield tokens in ascending, non-overlapping
        # order, each with integer ``start``/``end`` attributes - confirm.
        repl = repl if repl is not None else regex.repl

        # Collect the untouched gaps and the replacements, then join once
        # instead of growing a string with ``+=`` on every match.
        pieces, cursor = [], 0
        for token in self.extract(regex, result=True):
            pieces.append(self._text[cursor : token.start])
            pieces.append(repl)
            cursor = token.end
        pieces.append(self._text[cursor:])
        text = "".join(pieces)

        if result:
            return text
        self._text = text
        return self

    def update(self, text):
        """
        Load a new string and reset the values list.

        :param str text: text string
        :raises TypeError: if ``text`` is not a :class:`str`
        :return: ``self`` for chaining
        """
        if not isinstance(text, str):
            raise TypeError("Only support string.")
        self._text = text
        self._values = []
        return self

    def segment(self, regex=ASCII_WORD):
        """
        Segment the stored text into a list of strings.

        Spans matched by ``regex`` are kept whole; everything between matches
        is split into single characters, and space characters (``" "``) are
        dropped. With the default ``ASCII_WORD`` this is meant to keep English
        words complete while splitting Chinese text into characters.

        :param Regex regex: pattern whose matches stay unsplit; default is \
            ``ASCII_WORD``
        :return: list of segments in text order
        """
        # Look up the compiled pattern by name. ``compile_regex(regex)`` is
        # evaluated on every call, even on a cache hit, because it is passed
        # as the ``get`` default.
        compiled = PATTERNS.get(regex.name, compile_regex(regex))

        segments, cursor = [], 0
        for match in compiled.finditer(self._text):
            # Text before the match: one item per character, spaces skipped.
            segments.extend(
                char for char in self._text[cursor : match.start()] if char != " "
            )
            # The match itself is kept as a single segment.
            segments.append(self._text[match.start() : match.end()])
            cursor = match.end()
        # Trailing text after the last match.
        segments.extend(char for char in self._text[cursor:] if char != " ")
        return segments

    def remove_punctuation(self, repl=" ", punc=punc):
        """
        Remove punctuation from the stored text via ``punc.remove``.

        :param str repl: replacement string handed to ``punc.remove``; \
            defaults to a single space
        :param punc: object providing ``remove(text, repl)``; defaults to \
            the module-level ``punc``
        :return: ``self`` for chaining
        """
        self._text = punc.remove(self._text, repl)
        return self

    def normalize_punctuation(self, punc=punc):
        """
        Normalize punctuation in the stored text via ``punc.normalize``
        (intended to convert punctuation to its English form).

        :param punc: object providing ``normalize(text)``; defaults to the \
            module-level ``punc``
        :return: ``self`` for chaining
        """
        self._text = punc.normalize(self.text)
        return self