classes.c31_corpus

from inspect import cleandoc
from typing import Literal
import re
import colorama
import nltk
from nltk.corpus import stopwords
from loguru import logger
from icecream import ic

from lib import helpers, content

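# Note (assumption): the tokenization and stop-word filtering below rely on
# downloaded nltk corpora; if they are missing, a one-time setup is typically:
#   nltk.download("punkt")
#   nltk.download("stopwords")
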
class KCorpus:
    def __init__(self, data: str, limit=1000):
        """
        Args:
            data: Input data as a string or file path.
            limit (optional): Limit for n-grams.
        """

        if helpers.isFile(data):
            data = helpers.readFile(data)

        self.raw: str = data
        """The original input data (contents of file if file path was given)."""
        self.clean: str = self._toClean(self.raw)
        """Cleaned version of the input data."""
        self.sentences: list[str] = self._toSentences(self.clean)
        """List of sentences from the cleaned data."""
        self.lines: list[str] = self._toLines(self.clean)
        """List of lines from the cleaned data."""
        self.tokens: list[str] = self._toTokens(self.clean)
        """List of tokens from the cleaned data. May contain duplicates that are valuable in calculating nltk PMI."""
        self.words: list[str] = self._toWords(self.tokens)
        """List of unique words from the cleaned data."""
        self.bigrams: list[str] = self._toNgrams(self.tokens, limit, n=2)
        """List of word bigrams (2-grams)."""
        self.trigrams: list[str] = self._toNgrams(self.tokens, limit, n=3)
        """List of word trigrams (3-grams)."""
        self.quadgrams: list[str] = self._toNgrams(self.tokens, limit, n=4)
        """List of word quadgrams (4-grams)."""

    def __str__(self) -> str:
        """Returns a human-readable representation of the KCorpus object."""
        return cleandoc(
            f"""
            {colorama.Fore.BLACK}{colorama.Back.LIGHTGREEN_EX}KCorpus{colorama.Style.RESET_ALL} {len(self.tokens)} tokens {colorama.Fore.YELLOW}{len(self.words)} words {colorama.Fore.GREEN}{len(self.sentences)} sentences {colorama.Fore.CYAN}{len(self.lines)} lines {colorama.Fore.BLUE}{len(self.bigrams)} bigrams {colorama.Fore.MAGENTA}{len(self.trigrams)} trigrams {colorama.Fore.RED}{len(self.quadgrams)} quadgrams{colorama.Style.RESET_ALL}
            """
        )

    # Internals
    def _toClean(self, data: str) -> str:
        """Clean and prettify the input data."""
        dataPretty = content.prettifyText(data)
        dataOmitted = content.omitMissing(input=dataPretty, mode="words", debug=0)

        return dataOmitted.strip()

    def _toSentences(self, data: str) -> list[str]:
        """Returns cleaned data split into sentences."""
        return content.splitStringToSentences(data)

    def _toLines(self, data: str) -> list[str]:
        """Returns cleaned data split into lines."""
        lines = data.split("\n")
        lines = helpers.dedupe(lines)
        return lines

    def _toTokens(self, data: str) -> list[str]:
        """Returns cleaned, filtered words."""

        def _cleanTokens(tokens):
            removals = [
                # "-"
                r"^\W+$",
                # "- Hello" => "Hello"
                # "Hello -" => "Hello"
                r"^[.,']\s+|\s+[.,']$",
                # "'s" => None
                r"^\W+[A-Za-z]$",
            ]

            def cleanToken(token):
                for removal in removals:
                    matches = re.findall(removal, token)
                    if matches:
                        before = token
                        for match in matches:
                            token = re.sub(removal, "", token)
                            logger.trace(
                                "[Clean] {} \tfrom\t {} \t=> {}",
                                match,
                                before,
                                token or "empty",
                            )

                # Assume it's not an abbreviation => trim punctuation
                if len(token) > 4:
                    token = token.strip(".,:;-")

                return token

            tokens = [cleanToken(token) for token in tokens]
            return [token for token in tokens if token]  # Drop tokens emptied by cleaning

        stopWords = stopwords.words("english")
        wordsAll = nltk.word_tokenize(data)
        # ? Remove nltk stop words (the list is lowercase, so capitalized stop words pass through)
        wordsFiltered = [word for word in wordsAll if word not in stopWords]
        # ? Sanitize with my own defined blacklist
        wordsSanitized = content.sanitize(wordsFiltered)
        # Skip dedupe: Might help in establishing PMI
        wordsClean = _cleanTokens(wordsSanitized)

        return wordsClean

    def _toWords(self, tokens: list[str]) -> list[str]:
        """
        Returns unique words from the cleaned data.

        Args:
            tokens: List of tokens to process.
        """
        # ? Remove duplicates
        words = helpers.dedupe(tokens)
        return words

    def _toNgrams(
        self,
        data: list[str],
        limit: int,
        n: Literal[2, 3, 4] = 4,
        mode: Literal["quantity", "score"] = "quantity",
    ) -> list[str]:
        """
        Output phrases (n words).

        Args:
            data: The input data as a list of words.
            limit: Number of n-grams to return (mode "quantity") or minimum PMI score (mode "score").
            n: The size of the n-gram (2=bigram, 3=trigram, 4=quadgram).
            mode: 'quantity' for top-N, 'score' for above a PMI score.

        Returns:
            List of n-gram phrases.
        """
        if n == 3:
            measures = nltk.collocations.TrigramAssocMeasures()
            finder = nltk.TrigramCollocationFinder
        elif n == 4:
            measures = nltk.collocations.QuadgramAssocMeasures()
            finder = nltk.QuadgramCollocationFinder
        # Default: bigrams
        else:
            measures = nltk.collocations.BigramAssocMeasures()
            finder = nltk.BigramCollocationFinder

        finderInstance = finder.from_words(data)

        if mode == "score":
            ngrams = finderInstance.above_score(measures.pmi, limit)
        else:
            # Default mode: quantity
            ngrams = finderInstance.nbest(measures.pmi, limit)
        # ("Cold", "War") => "Cold War"
        ngrams = [" ".join(gram) for gram in ngrams]

        # Optional: Trim punctuation
        trimPunctuation = False
        if trimPunctuation:
            ngrams = [gram.strip(".,:;-") for gram in ngrams]

        return ngrams

    def serveSentences(
        self,
        mode: Literal["whole", "separate", "connected"] = "separate",
        limit: int = 100,
        shuffle=False,
    ) -> list[str]:
        """
        Serve sentences from the corpus.

        Args:
            mode: Mode of sentence serving (see below).
            limit: Limit for sentences or words depending on mode.
            shuffle: Whether to shuffle sentences.

        Modes:
        - `whole`
            - Whole sentences in logical succession
            - `limit` = number of sentences
        - `separate`
            - Chop sentences one by one
            - `limit` = max words in a sentence
        - `connected`
            - Connect whole sentences
            - `limit` = max sentences in a block

        Returns:
            List of sentences or sentence blocks.
        """
        sentences = (
            helpers.shuffleAtRandomSegment(self.sentences)
            if shuffle
            else self.sentences
        )

        if mode == "whole":
            sentences = sentences[:limit]
        else:
            sentences = content.chopList(sentences, limit, mode, shuffle=False)

        return sentences

    def serveLines(self, clamp: int | None = None) -> list[str]:
        """
        Serve shuffled lines from the corpus, optionally clamped.

        Args:
            clamp: Maximum number of lines to return (no clamp if None).

        Returns:
            List of lines.
        """
        lines = content.chopList(self.lines, clamp, mode="separate", shuffle=True)
        return lines

    def servePhrases(self, n: int | list[int] = [1, 2, 3, 4]) -> list[str]:
        """
        Serve phrases of specified n-gram sizes.

        Args:
            n: Word counts to include (one or multiple)
                - 1 = single words
                - 2 = bigrams
                - 3 = trigrams
                - 4 = quadgrams

        Returns:
            List of phrases.
        """
        ngrams = {1: self.words, 2: self.bigrams, 3: self.trigrams, 4: self.quadgrams}

        n = helpers.coerceList(n)
        phrases = helpers.flatten([ngrams.get(number) for number in n])
        phrases = helpers.dedupe(phrases)
        return phrases
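
# --- Illustrative sketch (not part of KCorpus) ---
# A minimal, self-contained demonstration of the nltk collocation pattern that
# _toNgrams builds on: score candidate bigrams by pointwise mutual information
# (PMI) and keep the top N via nbest(). The sample tokens and the helper name
# _demoPmiBigrams are hypothetical, for illustration only.
def _demoPmiBigrams(limit: int = 5) -> list[str]:
    tokens = "the cold war shaped the cold war era of cold war politics".split()
    measures = nltk.collocations.BigramAssocMeasures()
    finder = nltk.BigramCollocationFinder.from_words(tokens)
    # nbest() returns tuples like ("cold", "war"); join them into phrases
    return [" ".join(gram) for gram in finder.nbest(measures.pmi, limit)]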
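
# --- Usage sketch ---
# A hedged end-to-end example; assumes lib.helpers and lib.content are
# importable and the nltk corpora noted above are installed. The sample
# text is hypothetical.
if __name__ == "__main__":
    sample = (
        "The Cold War shaped decades of politics. "
        "Proxy wars flared across continents.\n"
        "Detente eased tensions for a time."
    )
    corpus = KCorpus(sample, limit=100)
    print(corpus)                                        # colorized summary via __str__
    print(corpus.serveSentences(mode="whole", limit=2))  # first two sentences
    print(corpus.serveLines(clamp=2))                    # up to two shuffled lines
    print(corpus.servePhrases(n=[1, 2]))                 # unique words + bigrams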