Source code for plane.punctuation

import re
import sys
import unicodedata


class Punctuation:
    """All the punctuations in Unicode.

    Abbr. Description ::

        Pc - Punctuation, Connector
        Pd - Punctuation, Dash
        Ps - Punctuation, Open
        Pe - Punctuation, Close
        Pi - Punctuation, Initial quote (may behave like Ps or Pe)
        Pf - Punctuation, Final quote (may behave like Ps or Pe)
        Po - Punctuation, Other

    Some chars are not included in punctuations.
    Such as: `+`, `^`, `$`, `~`.
    You can use :class:`Plane.pattern` to process these chars.

    :param dict normalization: punctuation normalization map
    """

    def __init__(self, normalization=None):
        self.repl = " "
        # Code points of every punctuation character; filled on first use
        # because scanning the whole Unicode range is comparatively slow.
        self.punc = None
        # Translation tables for ``str.translate``, cached per replacement.
        self.punc_map = {}
        # Compiled regex matching any key of the normalization map.
        self.normalizer = None
        # NOTE: the attribute name is misspelled ("normelization"); it is
        # kept as-is so code that already reads it keeps working.
        self.normelization = normalization or {
            "`": "'",
            "''": '"',
            "„": '"',
            "–": "-",
            "—": " - ",
            "´": "'",
            "‚": '"',
            "´´": '"',
            "…": "...",
            # French quotes
            "«": '"',
            "»": '"',
            # Chinese
            # The full-width forms are written as escapes so they cannot be
            # silently flattened to their ASCII look-alikes.
            "\uff0c": ",",  # FULLWIDTH COMMA
            "。": ".",
            "\uff1f": "?",  # FULLWIDTH QUESTION MARK
            "\uff01": "!",  # FULLWIDTH EXCLAMATION MARK
            "\uff1a": ":",  # FULLWIDTH COLON
            "\uff08": "(",  # FULLWIDTH LEFT PARENTHESIS
            "\uff09": ")",  # FULLWIDTH RIGHT PARENTHESIS
            "【": "(",
            "】": ")",
            "《": "(",
            "》": ")",
            "「": "(",
            "」": ")",
            "『": "(",
            "』": ")",
            "’": "'",
            "‘": "'",
            "“": '"',
            "”": '"',
            "\uff1b": ";",  # FULLWIDTH SEMICOLON
            "〜": "~",
        }

    def get_punc_map(self, repl=" "):
        """Return a ``str.translate`` table mapping punctuation to ``repl``.

        :param str repl: replacement for each punctuation character; it may
            be empty or longer than one character
        :return: dict mapping punctuation code points to ``repl``
        """
        if not self.punc:
            # ``+ 1`` so that the last code point is part of the scan.
            self.punc = [
                c
                for c in range(sys.maxunicode + 1)
                if unicodedata.category(chr(c)).startswith("P")
            ]
        if repl not in self.punc_map:
            # Every code point maps to the whole replacement string. Zipping
            # against ``repl * len(...)`` would pair code points with single
            # characters, which breaks empty and multi-character ``repl``.
            self.punc_map[repl] = dict.fromkeys(self.punc, repl)
        return self.punc_map[repl]

    def remove(self, text, repl=" "):
        """Remove all punctuations.

        This method uses :class:`unicodedata`
        (https://docs.python.org/3.6/library/unicodedata.html)
        to get all the punctuations.

        :param str text: input text
        :param str repl: string that replaces each punctuation character
        :return: ``text`` with every punctuation character replaced
        """
        return text.translate(self.get_punc_map(repl))

    def normalize(self, text):
        """Convert punctuations from other languages to English punctuations.

        Not every punctuation is included.

        - https://github.com/moses-smt/mosesdecoder/blob/master/scripts
        - http://xahlee.info/comp/unicode_punctuation_symbols.html
        - https://www.compart.com/en/unicode/category/Po

        :param str text: input text
        :return: ``text`` with every key of the normalization map replaced
            by its mapped value
        """
        if not self.normalizer:
            self.init_normalization()
        return self.normalizer.sub(
            lambda m: self.normelization[m.group(0)], text
        )

    def init_normalization(self):
        """Compile the regex that matches any key of the normalization map."""
        if not self.normalizer:
            # Regex alternation takes the first alternative that matches, so
            # longer keys go first; otherwise a one-character key would
            # always shadow a longer key that starts with it.
            keys = sorted(self.normelization, key=len, reverse=True)
            self.normalizer = re.compile(
                "({})".format("|".join(map(re.escape, keys)))
            )
# Module-level instance created at import time with the default normalization map.
punc = Punctuation()