Spaces:
Sleeping
Sleeping
import re

import nltk
from natasha import (Doc, MorphVocab, NamesExtractor, NewsEmbedding,
                     NewsMorphTagger, NewsNERTagger, NewsSyntaxParser,
                     Segmenter)
from nltk.corpus import stopwords

# Fetch the NLTK stopword corpus at import time (no-op if already downloaded).
nltk.download('stopwords')
class TextCleaner:
    """Normalize and optionally lemmatize mixed Russian/English text.

    The cleaning pipeline lower-cases the input, strips punctuation and
    Russian/English stopwords, collapses whitespace, and keeps only runs
    of Cyrillic letters matched by ``words_pattern``.  Lemmatization is
    done with natasha's morphological tagger.
    """

    def __init__(self, lemma: bool = True):
        # Whether execute() lemmatizes after cleaning.
        self.lemma = lemma
        # natasha components actually used by lemmatize().
        self.segmenter = Segmenter()
        self.morph_vocab = MorphVocab()
        emb = NewsEmbedding()
        self.morph_tagger = NewsMorphTagger(emb)
        # NOTE(review): the original also built NewsSyntaxParser,
        # NewsNERTagger and NamesExtractor into locals that were never
        # used anywhere; dropped to avoid loading models for nothing.
        # Sets give O(1) membership tests in text_preprocessing().
        self.en_stops = set(stopwords.words('english'))
        self.ru_stops = set(stopwords.words('russian'))
        self.punc = '''!()-[]{};:'"\,<>./?@#$%^&*_~'''
        # Cyrillic word runs; re.IGNORECASE is applied at match time.
        # NOTE(review): 'ё' is outside [а-я] and is therefore dropped —
        # presumably intentional, confirm with the data owner.
        self.words_pattern = r'[а-я]+'

    def execute(self, text):
        """Clean *text*; also lemmatize it when ``self.lemma`` is True."""
        text = self.text_preprocessing(text)
        if self.lemma:
            text = self.lemmatize(text)
        return text

    def text_preprocessing(self, data):
        """Return *data* lower-cased, de-punctuated, stopword-free, and
        reduced to Cyrillic tokens joined by single spaces."""
        data = " ".join(x.lower() for x in data.split())
        # BUG FIX: the original called str.replace('[^\w\s]', ''), which
        # treats the regex as a literal substring and removes nothing.
        # re.sub actually strips non-word, non-space characters.
        data = re.sub(r'[^\w\s]', '', data)
        data = " ".join(x for x in data.split()
                        if x not in self.ru_stops and x not in self.en_stops)
        # Belt-and-braces sweep for any punctuation the regex left behind.
        for punc in self.punc:
            if punc in data:
                data = data.replace(punc, "")
        data = re.sub(' +', ' ', data)
        return " ".join(
            re.findall(self.words_pattern, data, flags=re.IGNORECASE))

    def lemmatize(self, text):
        """Return the space-joined natasha lemmas of *text*."""
        doc = Doc(text)
        doc.segment(self.segmenter)
        doc.tag_morph(self.morph_tagger)
        for token in doc.tokens:
            token.lemmatize(self.morph_vocab)
        return " ".join(token.lemma for token in doc.tokens)