classes.c31_corpus
```python
from inspect import cleandoc
from typing import Literal
import re

import colorama
import nltk
from nltk.corpus import stopwords
from loguru import logger
from icecream import ic

from lib import helpers, content


class KCorpus:
    def __init__(self, data: str, limit=1000):
        """
        Args:
            data: Input data as a string or file path.
            limit (optional): Maximum number of n-grams to keep per size (default 1000).
        """

        if helpers.isFile(data):
            data = helpers.readFile(data)

        self.raw: str = data
        """The original input data (contents of the file if a file path was given)."""
        self.clean: str = self._toClean(self.raw)
        """Cleaned version of the input data."""
        self.sentences: list[str] = self._toSentences(self.clean)
        """List of sentences from the cleaned data."""
        self.lines: list[str] = self._toLines(self.clean)
        """List of lines from the cleaned data."""
        self.tokens: list[str] = self._toTokens(self.clean)
        """List of tokens from the cleaned data. May contain duplicates, which are valuable when calculating nltk PMI."""
        self.words: list[str] = self._toWords(self.tokens)
        """List of unique words from the cleaned data."""
        self.bigrams: list[str] = self._toNgrams(self.tokens, limit, n=2)
        """List of word bigrams (2-grams)."""
        self.trigrams: list[str] = self._toNgrams(self.tokens, limit, n=3)
        """List of word trigrams (3-grams)."""
        self.quadgrams: list[str] = self._toNgrams(self.tokens, limit, n=4)
        """List of word quadgrams (4-grams)."""

    def __str__(self) -> str:
        """Returns a human-readable representation of the KCorpus object."""
        return cleandoc(
            f"""
            {colorama.Fore.BLACK}{colorama.Back.LIGHTGREEN_EX}KCorpus{colorama.Style.RESET_ALL} {len(self.tokens)} tokens {colorama.Fore.YELLOW}{len(self.words)} words {colorama.Fore.GREEN}{len(self.sentences)} sentences {colorama.Fore.CYAN}{len(self.lines)} lines {colorama.Fore.BLUE}{len(self.bigrams)} bigrams {colorama.Fore.MAGENTA}{len(self.trigrams)} trigrams {colorama.Fore.RED}{len(self.quadgrams)} quadgrams{colorama.Style.RESET_ALL}
            """
        )

    # Internals
    def _toClean(self, data: str) -> str:
        """Clean and prettify the input data."""
        dataPretty = content.prettifyText(data)
        dataOmitted = content.omitMissing(input=dataPretty, mode="words", debug=0)

        return dataOmitted.strip()

    def _toSentences(self, data: str) -> list[str]:
        """Returns cleaned data split into sentences."""
        return content.splitStringToSentences(data)

    def _toLines(self, data: str) -> list[str]:
        """Returns cleaned data split into lines."""
        lines = data.split("\n")
        lines = helpers.dedupe(lines)
        return lines

    def _toTokens(self, data: str) -> list[str]:
        """Returns cleaned, filtered words."""

        def _cleanTokens(tokens):
            removals = [
                # "-"
                r"^\W+$",
                # "- Hello" => "Hello"
                # "Hello -" => "Hello"
                r"^[.,']\s+|\s+[.,']$",
                # "'s" => None
                r"^\W+[A-Za-z]$",
            ]

            def cleanToken(token):
                for removal in removals:
                    matches = re.findall(removal, token)
                    if matches:
                        before = token
                        for match in matches:
                            token = re.sub(removal, "", token)
                            logger.trace(
                                "[Clean] {} \tfrom\t {} \t=> {}",
                                match,
                                before,
                                token or "empty",
                            )

                # Assume it's not an abbreviation => trim punctuation
                if len(token) > 4:
                    token = token.strip(".,:;-")

                return token

            tokens = [cleanToken(token) for token in tokens]
            return [token for token in tokens if token]  # Remove empty tokens

        stopWords = stopwords.words("english")
        wordsAll = nltk.word_tokenize(data)
        # ? Remove nltk stop words
        wordsFiltered = [word for word in wordsAll if word not in stopWords]
        # ? Sanitize with my own defined blacklist
        wordsSanitized = content.sanitize(wordsFiltered)
        # Skip dedupe: Might help in establishing PMI
        wordsClean = _cleanTokens(wordsSanitized)

        return wordsClean

    def _toWords(self, tokens: list[str]) -> list[str]:
        """
        Returns unique words from the cleaned data.

        Args:
            tokens: List of tokens to process.
        """
        # ? Remove duplicates
        words = helpers.dedupe(tokens)
        return words

    def _toNgrams(
        self,
        data: list[str],
        limit: int,
        n: Literal[2, 3, 4] = 4,
        mode: Literal["quantity", "score"] = "quantity",
    ) -> list[str]:
        """
        Output phrases (n words).

        Args:
            data: The input data as a list of words.
            limit: Number of n-grams to return in 'quantity' mode, or the minimum PMI score in 'score' mode.
            n: The size of the n-gram (2=bigram, 3=trigram, 4=quadgram).
            mode: 'quantity' for the top `limit` n-grams, 'score' for all n-grams above a PMI score of `limit`.

        Returns:
            List of n-gram phrases.
        """
        if n == 3:
            measures = nltk.collocations.TrigramAssocMeasures()
            finder = nltk.TrigramCollocationFinder
        elif n == 4:
            measures = nltk.collocations.QuadgramAssocMeasures()
            finder = nltk.QuadgramCollocationFinder
        # Default: bigrams
        else:
            measures = nltk.collocations.BigramAssocMeasures()
            finder = nltk.BigramCollocationFinder

        finderInstance = finder.from_words(data)

        if mode == "score":
            ngrams = finderInstance.above_score(measures.pmi, limit)
        else:
            # Default mode: quantity
            ngrams = finderInstance.nbest(measures.pmi, limit)
        # ("Cold", "War") => "Cold War"
        ngrams = [" ".join(gram) for gram in ngrams]

        # Optional: Trim punctuation
        trimPunctuation = False
        if trimPunctuation:
            ngrams = [gram.strip(".,:;-") for gram in ngrams]

        return ngrams

    def serveSentences(
        self,
        mode: Literal["whole", "separate", "connected"] = "separate",
        limit: int = 100,
        shuffle=False,
    ) -> list[str]:
        """
        Serve sentences from the corpus.

        Args:
            mode: How sentences are served (see Modes below).
            limit: Limit on sentences or words, depending on mode.
            shuffle: Whether to shuffle sentences.

        Modes:
            - `whole`: whole sentences in logical succession; `limit` = number of sentences
            - `separate`: chop sentences one by one; `limit` = max words per sentence
            - `connected`: connect whole sentences; `limit` = max sentences per block

        Returns:
            List of sentences or sentence blocks.
        """
        sentences = (
            helpers.shuffleAtRandomSegment(self.sentences)
            if shuffle
            else self.sentences
        )

        if mode == "whole":
            sentences = sentences[:limit]
        else:
            sentences = content.chopList(sentences, limit, mode, shuffle=False)

        return sentences

    def serveLines(self, clamp: int | None = None) -> list[str]:
        """
        Serve shuffled lines from the corpus, optionally clamped.

        Args:
            clamp: Maximum number of lines to return.

        Returns:
            List of lines.
        """
        lines = content.chopList(self.lines, clamp, mode="separate", shuffle=True)
        return lines

    def servePhrases(self, n: int | list[int] = [1, 2, 3, 4]) -> list[str]:
        """
        Serve phrases of specified n-gram sizes.

        Args:
            n: n-gram sizes to include (one or multiple)
                - 1 = single words
                - 2 = bigrams
                - 3 = trigrams
                - 4 = quadgrams

        Returns:
            List of phrases.
        """
        ngrams = {1: self.words, 2: self.bigrams, 3: self.trigrams, 4: self.quadgrams}

        n = helpers.coerceList(n)
        phrases = helpers.flatten([ngrams.get(number) for number in n])
        phrases = helpers.dedupe(phrases)
        return phrases
```
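Under the hood, the `bigrams`/`trigrams`/`quadgrams` attributes rely on nltk's collocation finders ranked by PMI. A standalone sketch of the same pattern, using a toy token list rather than this module's pipeline:

```python
from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder

tokens = "the cold war ended and the cold war museum opened".split()
measures = BigramAssocMeasures()
finder = BigramCollocationFinder.from_words(tokens)

# Top 5 bigrams ranked by pointwise mutual information (PMI)
best = finder.nbest(measures.pmi, 5)
print([" ".join(gram) for gram in best])
```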
class KCorpus:
KCorpus(data: str, limit=1000)
Arguments:
- data: Input data as a string or file path.
- limit (optional): Maximum number of n-grams to keep per size (default 1000).
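A minimal usage sketch (the file path is hypothetical, and nltk's `punkt` and `stopwords` data must be downloaded once beforehand):

```python
import nltk

nltk.download("punkt")      # tokenizer data used by word_tokenize
nltk.download("stopwords")  # stop-word lists

corpus = KCorpus("data/articles.txt", limit=500)  # hypothetical path
print(corpus)               # colorized one-line summary via __str__
print(corpus.words[:10])    # first 10 unique words
print(corpus.trigrams[:5])  # top 5 PMI-ranked trigrams
```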
tokens: list[str]
List of tokens from the cleaned data. May contain duplicates, which are valuable when calculating nltk PMI.
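The duplicates matter because PMI is estimated from raw frequencies: under the standard definition used by nltk's association measures, `PMI(x, y) = log2( p(x, y) / (p(x) * p(y)) )`, with the probabilities derived from token counts. Deduplicating tokens before scoring would flatten every count to 1 and make the scores meaningless.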
def serveSentences(self, mode: Literal['whole', 'separate', 'connected'] = 'separate', limit: int = 100, shuffle=False) -> list[str]:
Serve sentences from the corpus.
Arguments:
- mode: How sentences are served (see Modes below).
- limit: Limit on sentences or words, depending on mode.
- shuffle: Whether to shuffle sentences.
Modes:
- `whole`: whole sentences in logical succession; `limit` = number of sentences
- `separate`: chop sentences one by one; `limit` = max words per sentence
- `connected`: connect whole sentences; `limit` = max sentences per block
Returns:
List of sentences or sentence blocks.
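A sketch of the three modes, assuming the `corpus` instance from the earlier example:

```python
# Up to 100 whole sentences, in their original order
intro = corpus.serveSentences(mode="whole", limit=100)

# Sentences chopped to at most 12 words each, shuffled first
snippets = corpus.serveSentences(mode="separate", limit=12, shuffle=True)

# Blocks of at most 3 connected sentences
blocks = corpus.serveSentences(mode="connected", limit=3)
```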
def serveLines(self, clamp: int | None = None) -> list[str]:
Serve shuffled lines from the corpus, optionally clamped.
Arguments:
- clamp: Maximum number of lines to return.
Returns:
List of lines.
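For example, again assuming the `corpus` instance from above:

```python
lines = corpus.serveLines(clamp=20)  # at most 20 lines, pre-shuffled
```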
def servePhrases(self, n: int | list[int] = [1, 2, 3, 4]) -> list[str]:
Serve phrases of specified n-gram sizes.
Arguments:
- n: n-gram sizes to include (a single size or a list of sizes)
- 1 = single words
- 2 = bigrams
- 3 = trigrams
- 4 = quadgrams
Returns:
List of phrases.
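For example (assuming `helpers.coerceList` wraps a single value in a list, as its name suggests):

```python
phrases = corpus.servePhrases(n=[2, 3])  # bigrams and trigrams only
words = corpus.servePhrases(n=1)         # single words; scalar coerced to [1]
```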