classes.c34_text_cleaner

  1import re
  2from typing import Literal, Union
  3from loguru import logger
  4from icecream import ic
  5
  6from lib import content
  7
  8
  9class KTextCleaner:
 10    FORBIDDEN_TERMS_PATH = "/Users/christianjansky/Library/CloudStorage/Dropbox/KOMETA-Draw/01 Content/forbidden-any.txt"
 11
 12    def __init__(self, text: str | list[str] = None):
 13        """Initialize and optionally set text to be cleaned (will return self for chaining)."""
 14        self.text = text
 15
 16    def _getForbiddenTerms(self, extendTerms: list[str] = None) -> list[str]:
 17        """Load forbidden terms from default file and optionally extend the list."""
 18        with open(self.FORBIDDEN_TERMS_PATH, encoding="utf-8") as f:
 19            terms = f.read().splitlines()
 20
 21        if extendTerms:
 22            terms.extend(extendTerms)
 23
 24        cleanTerms = []
 25        for term in terms:
 26            cleaned = term.strip().lower()
 27            if cleaned and cleaned not in cleanTerms:
 28                cleanTerms.append(cleaned)
 29
 30        return cleanTerms
 31
 32    def get(self, newText: str | list[str] = None) -> str | list[str]:
 33        """Return the current text or set a new text."""
 34        if newText is not None:
 35            self.text = newText
 36        if self.text is None:
 37            raise ValueError("No text available for cleaning.")
 38        return self.text
 39
 40    def cleanWikipedia(self, input: str = None) -> Union[str, "KTextCleaner"]:
 41        """Clean Wikipedia-specific markup from text."""
 42        txt = self.get(input)
 43
 44        replacements = [
 45            # Add space when citation is directly followed by lowercase letter
 46            (r"(\[\d+(?::?\d+(?:[-–]\d+)?)?\])([a-z])", r"\1 \2"),
 47            (r"(\[(?:[Nn]ote|[Nn]?[Bb]|n)\s+\d+\])([a-z])", r"\1 \2"),
 48            (r"(\[[a-z]\])([a-z])", r"\1 \2"),
 49            # Replace section headers like == Header == with just "Header."
 50            (r"^={2,}\s*(.*?)\s*={2,}$", r"\1."),
 51        ]
 52        for pattern, replacement in replacements:
 53            txt = re.sub(
 54                pattern, replacement, txt, flags=re.MULTILINE
 55            )  # MULTILINE for ^ $ to work line-wise
 56
 57        removals = [
 58            # Remove citation references preceeded by non-whitespace: [1][2], [3]:15
 59            r"(?<=\S)\[\d+\](?::?\d+(?:[-–]\d+)?)?",
 60            # Remove note references: [note 1], [n 1], [nb 1], [NB 1], [Note 1]
 61            r"\[(?:[Nn]ote|[Nn]?[Bb]|n)\s+\d+\]",
 62            # Remove single letter notes: [a], [b], etc.
 63            r"(?<=\S)\[[a-z]\]",
 64            # Remove editorial tags
 65            r"(\{{2}|\[)[Cc]itation( needed)?(\}{2}|\])",
 66            r"(\{{2}|\[)[Cc]larification( needed)?(\}{2}|\])",
 67            r"(\{{2}|\[)[Vv]erification( needed)?(\}{2}|\])",
 68            r"(\{{2}|\[)[Cc]ite [Bb]ook(\}{2}|\])",
 69            # Remove IPA pronunciations only (avoid removing regular word/word slashes)
 70            # Also consumes optional surrounding punctuation spacing (e.g. "; /ræm/").
 71            r"(?:\s*[;,]\s*)?/(?=[^/\n]{1,80}/)(?=[^/\n]*[ˈˌːɪʊəɛæɑɔʃʒθðŋɡɹɾʔ])[^/\n]{1,80}/(?:\s*[,;])?",
 72        ]
 73        for removal in removals:
 74            txt = re.sub(rf"{removal}", "", txt)
 75
 76        # Minimal cleanup for whitespace artifacts introduced by removals above.
 77        cleanupReplacements = [
 78            # Prevent leading whitespace right after opening punctuation, e.g. "( born"
 79            (r"([\(\[\{])\s+", r"\1"),
 80            # Remove whitespace before punctuation introduced by token deletion
 81            (r"\s+([,.;:!?])", r"\1"),
 82            # Collapse repeated horizontal spaces introduced by removals
 83            (r"[ \t]{2,}", " "),
 84        ]
 85        for pattern, replacement in cleanupReplacements:
 86            txt = re.sub(pattern, replacement, txt)
 87        self.text = txt.strip()
 88        # ! input = '' is falsy and would return self instead of cleaned text, so check explicitly for None
 89        # ? May return empty string ''
 90        return self.text if input is not None else self
 91
 92    def removeLaTeX(self, input: str = None) -> Union[str, "KTextCleaner"]:
 93        """Remove LaTeX math markup and remnants while preserving prose."""
 94        txt = self.get(input)
 95
 96        def _dropStyledMathBlocks(value: str) -> str:
 97            # Remove complete style wrappers such as {\displaystyle ...} / {\textstyle ...}
 98            stylePattern = re.compile(r"\{\\(?:display|text)style\b")
 99
100            while True:
101                match = stylePattern.search(value)
102                if not match:
103                    break
104
105                start = match.start()
106                i = match.end()
107                depth = 1
108
109                while i < len(value) and depth > 0:
110                    char = value[i]
111                    if char == "{":
112                        depth += 1
113                    elif char == "}":
114                        depth -= 1
115                    i += 1
116
117                if depth == 0:
118                    value = value[:start] + " " + value[i:]
119                else:
120                    # Truncated/malformed block: remove to nearest sentence break.
121                    sentenceBreak = re.search(r"[.!?\n]", value[start:])
122                    if sentenceBreak:
123                        end = start + sentenceBreak.start()
124                        value = value[:start] + " " + value[end:]
125                    else:
126                        value = value[:start]
127                        break
128
129            return value
130
131        def _dropLatexCommandRuns(value: str) -> str:
132            # Remove command groups with braced arguments (including nested via iteration).
133            previous = None
134            while previous != value:
135                previous = value
136                value = re.sub(r"\\[A-Za-z]+\*?\s*\{[^{}]*\}", " ", value)
137
138            # Remove heavy command runs likely to be formula payloads.
139            commandRun = re.compile(
140                r"(?:\\(?:frac|sqrt|text|mathrm|mathbf|hat|vec|cdot|times|int|sum|prod|partial|nabla|left|right|lVert|rVert|alpha|beta|gamma|delta|epsilon|theta|lambda|mu|nu|pi|rho|sigma|tau|phi|psi|omega|Delta|Omega|to|propto|leq|geq|neq|approx|pm|mp|infty|sin|cos|tan|log|ln)\b"
141                r"|[_^]\{[^{}]*\}"
142                r"|[_^][A-Za-z0-9])"
143                r"(?:[\s{}()\[\],.=:+\-*/<>|]|\\[A-Za-z]+|\d)*"
144            )
145            value = commandRun.sub(" ", value)
146
147            # Remove any leftover escaped commands.
148            value = re.sub(r"\\[A-Za-z]+\*?", " ", value)
149            return value
150
151        def _dropFormulaLikeBraces(value: str) -> str:
152            # Remove brace blocks with formula-like symbols/structure.
153            braceFormula = re.compile(r"\{[^{}]*(?:[_^=]|\\|\d\s*[+\-*/=])[^{}]*\}")
154
155            previous = None
156            while previous != value:
157                previous = value
158                value = braceFormula.sub(" ", value)
159
160            return value
161
162        txt = _dropStyledMathBlocks(txt)
163        txt = _dropLatexCommandRuns(txt)
164        txt = _dropFormulaLikeBraces(txt)
165
166        # Remove obvious punctuation remnants from stripped math chunks.
167        txt = re.sub(r"\(\s*[;,:\-–—]*\s*\)", "", txt)
168        txt = re.sub(r"\[\s*[;,:\-–—]*\s*\]", "", txt)
169        txt = re.sub(r"\{\s*[;,:\-–—]*\s*\}", "", txt)
170        txt = re.sub(r"[{}]", " ", txt)
171        txt = re.sub(r"\s+([.,;:!?])", r"\1", txt)
172        txt = re.sub(r"([.,;:!?]){2,}", r"\1", txt)
173        txt = re.sub(r"[ \t]*\n[ \t]*", "\n", txt)
174        txt = re.sub(r"[ \t]{2,}", " ", txt)
175        txt = re.sub(r"\n{3,}", "\n\n", txt)
176
177        self.text = txt.strip()
178        return self.text if input is not None else self
179
180    def cleanPunctuation(self, input: str = None) -> Union[str, "KTextCleaner"]:
181        """Remove empty parentheses, brackets, and braces; condense multiple punctuation marks."""
182        txt = self.get(input)
183
184        replacements = [
185            # Remove empty parentheses, brackets, and braces
186            (r"[\(\[\{]\s*[\)\]\}]", ""),
187            # Replace multiple punctuation marks with a single one
188            (r"([.,;:!?]){2,}", r"\1"),
189        ]
190        for pattern, replacement in replacements:
191            txt = re.sub(pattern, replacement, txt)
192
193        self.text = txt.strip()
194        return self.text if input is not None else self
195
196    def _smartDoubleQuotes(self, txt: str) -> str:
197        """Convert dumb double quotes to smart quotes with basic nesting support."""
198        opening_context = set(" \t\n\r([{-–—/\\“‘«")
199        result = []
200        length = len(txt)
201
202        for i, char in enumerate(txt):
203            if char != '"':
204                result.append(char)
205                continue
206
207            prev_char = txt[i - 1] if i > 0 else ""
208            next_char = txt[i + 1] if i + 1 < length else ""
209
210            # Keep measurement marks like 11" unchanged.
211            if prev_char.isdigit():
212                result.append(char)
213                continue
214
215            is_opening = (
216                i == 0
217                or prev_char in opening_context
218                or (prev_char in ":;,.!?" and next_char and next_char.isalpha())
219            )
220            result.append("“" if is_opening else "”")
221
222        return "".join(result)
223
224    def improvePunctuation(self, input: str = None) -> Union[str, "KTextCleaner"]:
225        """Convert dumb quotes to smart quotes."""
226        txt = self.get(input)
227
228        replacements = [
229            # Apostrophes between letters (e.g., Wet'n'Wild, O'Neill)
230            (r"(?<=[A-Za-z])'(?=[A-Za-z])", "’"),
231            # Apostrophes in contractions and possessives
232            (r"\b([A-Za-z]+)'([A-Za-z]*)\b", r"\1’\2"),
233            # Single quotes (not measurements like 5', not after digits) for quotes around words (at least 2 characters to avoid contractions like ’n’)
234            (r"(?<!\w)(?<!\d)'([^']{2,}?)'(?!\d)", r"‘\1’"),
235            # Single quote used as apostrophe for s (e.g., Chris')
236            (r"([A-Za-z])'(\s)\b", r"\1’\2"),
237            # Replace hyphen with en-dash for number ranges (years in parens, dates, etc - not math)
238            (r"(\d{4})\s*-\s*(\d{4})", r"\1–\2"),
239            # Abbreviate shortened year NOT preceded by digit with apostrophe, e.g., '95 or '60s
240            (r"(?<!\d)'(\d{2})(s?)\b", r"’\1\2"),
241            # Add apostrophe for word contractions: 'tis -> ’tis
242            (r"(?<!\w)'([A-Za-z]+)\b", r"’\1"),
243            # Convert double hyphens to em dash
244            (r"--+", "—"),
245            # Add em dash between words with spaces around
246            (r"(\w\s+)-(\s+\w)", r"\1—\2"),
247            # Add ellipsis character for three dots with a space before and after
248            (r"(\w)\s*\.{3}", r"\1 …"),
249            # Dedupe repeating marks (e.g., "!!" -> "!", "???" -> "?")
250            (r"([.,;:!?–—]){2,}", r"\1"),
251            # Remove trailing separator before closing bracket, e.g. (RAM;) -> (RAM)
252            (r"([A-Za-z0-9])\s*[;,:]\s*([\)\]\}])", r"\1\2"),
253        ]
254        for pattern, replacement in replacements:
255            txt = re.sub(pattern, replacement, txt)
256
257        txt = self._smartDoubleQuotes(txt)
258
259        self.text = txt.strip()
260        return self.text if input is not None else self
261
262    def removeOutsidePunctuation(self, input: str = None) -> Union[str, "KTextCleaner"]:
263        """Remove punctuation characters at the start or end of the text."""
264        txt = self.get(input)
265
266        punctuationToRemove = r"""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~¡¿“”‘’«»"""
267        txt = re.sub(rf"^[\s{punctuationToRemove}]+", "", txt)  # Leading punctuation
268        txt = re.sub(rf"[\s{punctuationToRemove}]+$", "", txt)  # Trailing punctuation
269
270        self.text = txt.strip()
271        return self.text if input is not None else self
272
273    def removeOrphanedPunctuation(
274        self, input: str = None
275    ) -> Union[str, "KTextCleaner"]:
276        """Remove punctuation characters without relevant counterpart."""
277        txt = self.get(input)
278
279        # Stack-based approach for proper matching
280        pairs = {"(": ")", "[": "]", "{": "}", "“": "”", "‘": "’", "<": ">", "«": "»"}
281        stack = []
282        to_remove = set()
283
284        # Find positions of unmatched brackets
285        for i, char in enumerate(txt):
286            if char in pairs:
287                stack.append((i, char))
288            elif char in pairs.values():
289                if stack and pairs[stack[-1][1]] == char:
290                    stack.pop()
291                else:
292                    to_remove.add(i)  # Unmatched closing bracket
293
294        # Remaining in stack are unmatched opening brackets
295        to_remove.update(pos for pos, _ in stack)
296
297        # Build result without unmatched brackets
298        txt = "".join(char for i, char in enumerate(txt) if i not in to_remove)
299
300        txt = txt.lstrip(
301            ",.:;!?"
302        )  # Also remove common leading punctuation if it becomes leading after cleanup
303        self.text = txt.strip()
304        return self.text if input is not None else self
305
306    def cleanWhitespace(self, input: str = None) -> Union[str, "KTextCleaner"]:
307        """Clean up whitespace around punctuation and condense multiple spaces."""
308        txt = self.get(input)
309
310        replacements = [
311            # Replace multiple spaces with a single space
312            (r"[ \t]{2,}", " "),
313            # Replace multiple line-breaks with a single line-break
314            (r"\n{3,}", "\n\n"),
315            # Remove space before punctuation
316            (r"\s+([.,;:!?])", r"\1"),
317            # Remove space after opening punctuation
318            (r"([\(\[\{¡¿])\s+", r"\1"),
319            # Remove space before closing punctuation
320            (r"\s+([\)\]\}])", r"\1"),
321            # Ensure space after sentence-ending punctuation if followed by capital letter
322            (r"([a-z]+)([.?!])([A-Z][a-z]+\b)", r"\1\2 \3"),
323        ]
324        for pattern, replacement in replacements:
325            txt = re.sub(pattern, replacement, txt)
326
327        self.text = txt.strip()
328        return self.text if input is not None else self
329
330    def cleanExtra(self, input: str = None) -> Union[str, "KTextCleaner"]:
331        # Remove unwanted characters (example: non-ASCII)
332        txt = self.get(input)
333        txt = re.sub(r"[^\x00-\x7F]+", "", txt)
334
335        self.text = txt.strip()
336        return self.text if input is not None else self
337
338    def sanitizeForbidden(
339        self,
340        input: str | list[str] = None,
341        dropStrategy: Literal["word", "sentence"] = "word",
342        extendTerms: list[str] = None,
343    ) -> Union[str, list[str], "KTextCleaner"]:
344        """
345        Sanitize text using the forbidden-terms list.
346
347        Args:
348            input: Input text or list of strings to sanitize.
349            dropStrategy: `word` to remove only incriminating words,
350                `sentence` to drop any sentence containing an incriminating word.
351            extendTerms: Optional list of additional forbidden terms.
352        """
353        txt = self.get(input)
354        terms = self._getForbiddenTerms(extendTerms)
355
356        if dropStrategy not in ["word", "sentence"]:
357            raise ValueError(
358                "dropStrategy must be either 'word' or 'sentence', got "
359                + f"{dropStrategy}"
360            )
361
362        def _containsForbidden(item: str) -> bool:
363            itemLower = item.lower()
364            return any(term in itemLower for term in terms)
365
366        def _normalizeWhitespace(value: str) -> str:
367            replacements = [
368                (r"[ \t]{2,}", " "),
369                (r"\n{3,}", "\n\n"),
370                (r"\s+([.,;:!?])", r"\1"),
371                (r"([\(\[\{¡¿])\s+", r"\1"),
372                (r"\s+([\)\]\}])", r"\1"),
373                (r"([a-z]+)([.?!])([A-Z][a-z]+\b)", r"\1\2 \3"),
374            ]
375            for pattern, replacement in replacements:
376                value = re.sub(pattern, replacement, value)
377            return value.strip()
378
379        def _sanitizeString(value: str) -> str:
380            sentences = content.splitStringToSentences(value)
381
382            if dropStrategy == "sentence":
383                sentences = [s for s in sentences if not _containsForbidden(s)]
384            else:
385                cleanedSentences = []
386                for sentence in sentences:
387                    words = sentence.split(" ")
388                    words = [
389                        word for word in words if word and not _containsForbidden(word)
390                    ]
391                    if words:
392                        cleanedSentences.append(" ".join(words))
393                sentences = cleanedSentences
394
395            cleanValue = " ".join(sentences)
396            return _normalizeWhitespace(cleanValue)
397
398        if isinstance(txt, list):
399            if dropStrategy == "sentence":
400                output = [
401                    _normalizeWhitespace(item)
402                    for item in txt
403                    if not _containsForbidden(item) and _normalizeWhitespace(item)
404                ]
405            else:
406                output = []
407                for item in txt:
408                    cleanItem = _sanitizeString(item)
409                    if cleanItem:
410                        output.append(cleanItem)
411            self.text = output
412        else:
413            output = _sanitizeString(txt)
414            self.text = output.strip()
415
416        return self.text if input is not None else self
class KTextCleaner:
 10class KTextCleaner:
 11    FORBIDDEN_TERMS_PATH = "/Users/christianjansky/Library/CloudStorage/Dropbox/KOMETA-Draw/01 Content/forbidden-any.txt"
 12
 13    def __init__(self, text: str | list[str] = None):
 14        """Initialize and optionally set text to be cleaned (will return self for chaining)."""
 15        self.text = text
 16
 17    def _getForbiddenTerms(self, extendTerms: list[str] = None) -> list[str]:
 18        """Load forbidden terms from default file and optionally extend the list."""
 19        with open(self.FORBIDDEN_TERMS_PATH, encoding="utf-8") as f:
 20            terms = f.read().splitlines()
 21
 22        if extendTerms:
 23            terms.extend(extendTerms)
 24
 25        cleanTerms = []
 26        for term in terms:
 27            cleaned = term.strip().lower()
 28            if cleaned and cleaned not in cleanTerms:
 29                cleanTerms.append(cleaned)
 30
 31        return cleanTerms
 32
 33    def get(self, newText: str | list[str] = None) -> str | list[str]:
 34        """Return the current text or set a new text."""
 35        if newText is not None:
 36            self.text = newText
 37        if self.text is None:
 38            raise ValueError("No text available for cleaning.")
 39        return self.text
 40
 41    def cleanWikipedia(self, input: str = None) -> Union[str, "KTextCleaner"]:
 42        """Clean Wikipedia-specific markup from text."""
 43        txt = self.get(input)
 44
 45        replacements = [
 46            # Add space when citation is directly followed by lowercase letter
 47            (r"(\[\d+(?::?\d+(?:[-–]\d+)?)?\])([a-z])", r"\1 \2"),
 48            (r"(\[(?:[Nn]ote|[Nn]?[Bb]|n)\s+\d+\])([a-z])", r"\1 \2"),
 49            (r"(\[[a-z]\])([a-z])", r"\1 \2"),
 50            # Replace section headers like == Header == with just "Header."
 51            (r"^={2,}\s*(.*?)\s*={2,}$", r"\1."),
 52        ]
 53        for pattern, replacement in replacements:
 54            txt = re.sub(
 55                pattern, replacement, txt, flags=re.MULTILINE
 56            )  # MULTILINE for ^ $ to work line-wise
 57
 58        removals = [
 59            # Remove citation references preceeded by non-whitespace: [1][2], [3]:15
 60            r"(?<=\S)\[\d+\](?::?\d+(?:[-–]\d+)?)?",
 61            # Remove note references: [note 1], [n 1], [nb 1], [NB 1], [Note 1]
 62            r"\[(?:[Nn]ote|[Nn]?[Bb]|n)\s+\d+\]",
 63            # Remove single letter notes: [a], [b], etc.
 64            r"(?<=\S)\[[a-z]\]",
 65            # Remove editorial tags
 66            r"(\{{2}|\[)[Cc]itation( needed)?(\}{2}|\])",
 67            r"(\{{2}|\[)[Cc]larification( needed)?(\}{2}|\])",
 68            r"(\{{2}|\[)[Vv]erification( needed)?(\}{2}|\])",
 69            r"(\{{2}|\[)[Cc]ite [Bb]ook(\}{2}|\])",
 70            # Remove IPA pronunciations only (avoid removing regular word/word slashes)
 71            # Also consumes optional surrounding punctuation spacing (e.g. "; /ræm/").
 72            r"(?:\s*[;,]\s*)?/(?=[^/\n]{1,80}/)(?=[^/\n]*[ˈˌːɪʊəɛæɑɔʃʒθðŋɡɹɾʔ])[^/\n]{1,80}/(?:\s*[,;])?",
 73        ]
 74        for removal in removals:
 75            txt = re.sub(rf"{removal}", "", txt)
 76
 77        # Minimal cleanup for whitespace artifacts introduced by removals above.
 78        cleanupReplacements = [
 79            # Prevent leading whitespace right after opening punctuation, e.g. "( born"
 80            (r"([\(\[\{])\s+", r"\1"),
 81            # Remove whitespace before punctuation introduced by token deletion
 82            (r"\s+([,.;:!?])", r"\1"),
 83            # Collapse repeated horizontal spaces introduced by removals
 84            (r"[ \t]{2,}", " "),
 85        ]
 86        for pattern, replacement in cleanupReplacements:
 87            txt = re.sub(pattern, replacement, txt)
 88        self.text = txt.strip()
 89        # ! input = '' is falsy and would return self instead of cleaned text, so check explicitly for None
 90        # ? May return empty string ''
 91        return self.text if input is not None else self
 92
 93    def removeLaTeX(self, input: str = None) -> Union[str, "KTextCleaner"]:
 94        """Remove LaTeX math markup and remnants while preserving prose."""
 95        txt = self.get(input)
 96
 97        def _dropStyledMathBlocks(value: str) -> str:
 98            # Remove complete style wrappers such as {\displaystyle ...} / {\textstyle ...}
 99            stylePattern = re.compile(r"\{\\(?:display|text)style\b")
100
101            while True:
102                match = stylePattern.search(value)
103                if not match:
104                    break
105
106                start = match.start()
107                i = match.end()
108                depth = 1
109
110                while i < len(value) and depth > 0:
111                    char = value[i]
112                    if char == "{":
113                        depth += 1
114                    elif char == "}":
115                        depth -= 1
116                    i += 1
117
118                if depth == 0:
119                    value = value[:start] + " " + value[i:]
120                else:
121                    # Truncated/malformed block: remove to nearest sentence break.
122                    sentenceBreak = re.search(r"[.!?\n]", value[start:])
123                    if sentenceBreak:
124                        end = start + sentenceBreak.start()
125                        value = value[:start] + " " + value[end:]
126                    else:
127                        value = value[:start]
128                        break
129
130            return value
131
132        def _dropLatexCommandRuns(value: str) -> str:
133            # Remove command groups with braced arguments (including nested via iteration).
134            previous = None
135            while previous != value:
136                previous = value
137                value = re.sub(r"\\[A-Za-z]+\*?\s*\{[^{}]*\}", " ", value)
138
139            # Remove heavy command runs likely to be formula payloads.
140            commandRun = re.compile(
141                r"(?:\\(?:frac|sqrt|text|mathrm|mathbf|hat|vec|cdot|times|int|sum|prod|partial|nabla|left|right|lVert|rVert|alpha|beta|gamma|delta|epsilon|theta|lambda|mu|nu|pi|rho|sigma|tau|phi|psi|omega|Delta|Omega|to|propto|leq|geq|neq|approx|pm|mp|infty|sin|cos|tan|log|ln)\b"
142                r"|[_^]\{[^{}]*\}"
143                r"|[_^][A-Za-z0-9])"
144                r"(?:[\s{}()\[\],.=:+\-*/<>|]|\\[A-Za-z]+|\d)*"
145            )
146            value = commandRun.sub(" ", value)
147
148            # Remove any leftover escaped commands.
149            value = re.sub(r"\\[A-Za-z]+\*?", " ", value)
150            return value
151
152        def _dropFormulaLikeBraces(value: str) -> str:
153            # Remove brace blocks with formula-like symbols/structure.
154            braceFormula = re.compile(r"\{[^{}]*(?:[_^=]|\\|\d\s*[+\-*/=])[^{}]*\}")
155
156            previous = None
157            while previous != value:
158                previous = value
159                value = braceFormula.sub(" ", value)
160
161            return value
162
163        txt = _dropStyledMathBlocks(txt)
164        txt = _dropLatexCommandRuns(txt)
165        txt = _dropFormulaLikeBraces(txt)
166
167        # Remove obvious punctuation remnants from stripped math chunks.
168        txt = re.sub(r"\(\s*[;,:\-–—]*\s*\)", "", txt)
169        txt = re.sub(r"\[\s*[;,:\-–—]*\s*\]", "", txt)
170        txt = re.sub(r"\{\s*[;,:\-–—]*\s*\}", "", txt)
171        txt = re.sub(r"[{}]", " ", txt)
172        txt = re.sub(r"\s+([.,;:!?])", r"\1", txt)
173        txt = re.sub(r"([.,;:!?]){2,}", r"\1", txt)
174        txt = re.sub(r"[ \t]*\n[ \t]*", "\n", txt)
175        txt = re.sub(r"[ \t]{2,}", " ", txt)
176        txt = re.sub(r"\n{3,}", "\n\n", txt)
177
178        self.text = txt.strip()
179        return self.text if input is not None else self
180
181    def cleanPunctuation(self, input: str = None) -> Union[str, "KTextCleaner"]:
182        """Remove empty parentheses, brackets, and braces; condense multiple punctuation marks."""
183        txt = self.get(input)
184
185        replacements = [
186            # Remove empty parentheses, brackets, and braces
187            (r"[\(\[\{]\s*[\)\]\}]", ""),
188            # Replace multiple punctuation marks with a single one
189            (r"([.,;:!?]){2,}", r"\1"),
190        ]
191        for pattern, replacement in replacements:
192            txt = re.sub(pattern, replacement, txt)
193
194        self.text = txt.strip()
195        return self.text if input is not None else self
196
197    def _smartDoubleQuotes(self, txt: str) -> str:
198        """Convert dumb double quotes to smart quotes with basic nesting support."""
199        opening_context = set(" \t\n\r([{-–—/\\“‘«")
200        result = []
201        length = len(txt)
202
203        for i, char in enumerate(txt):
204            if char != '"':
205                result.append(char)
206                continue
207
208            prev_char = txt[i - 1] if i > 0 else ""
209            next_char = txt[i + 1] if i + 1 < length else ""
210
211            # Keep measurement marks like 11" unchanged.
212            if prev_char.isdigit():
213                result.append(char)
214                continue
215
216            is_opening = (
217                i == 0
218                or prev_char in opening_context
219                or (prev_char in ":;,.!?" and next_char and next_char.isalpha())
220            )
221            result.append("“" if is_opening else "”")
222
223        return "".join(result)
224
225    def improvePunctuation(self, input: str = None) -> Union[str, "KTextCleaner"]:
226        """Convert dumb quotes to smart quotes."""
227        txt = self.get(input)
228
229        replacements = [
230            # Apostrophes between letters (e.g., Wet'n'Wild, O'Neill)
231            (r"(?<=[A-Za-z])'(?=[A-Za-z])", "’"),
232            # Apostrophes in contractions and possessives
233            (r"\b([A-Za-z]+)'([A-Za-z]*)\b", r"\1’\2"),
234            # Single quotes (not measurements like 5', not after digits) for quotes around words (at least 2 characters to avoid contractions like ’n’)
235            (r"(?<!\w)(?<!\d)'([^']{2,}?)'(?!\d)", r"‘\1’"),
236            # Single quote used as apostrophe for s (e.g., Chris')
237            (r"([A-Za-z])'(\s)\b", r"\1’\2"),
238            # Replace hyphen with en-dash for number ranges (years in parens, dates, etc - not math)
239            (r"(\d{4})\s*-\s*(\d{4})", r"\1–\2"),
240            # Abbreviate shortened year NOT preceded by digit with apostrophe, e.g., '95 or '60s
241            (r"(?<!\d)'(\d{2})(s?)\b", r"’\1\2"),
242            # Add apostrophe for word contractions: 'tis -> ’tis
243            (r"(?<!\w)'([A-Za-z]+)\b", r"’\1"),
244            # Convert double hyphens to em dash
245            (r"--+", "—"),
246            # Add em dash between words with spaces around
247            (r"(\w\s+)-(\s+\w)", r"\1—\2"),
248            # Add ellipsis character for three dots with a space before and after
249            (r"(\w)\s*\.{3}", r"\1 …"),
250            # Dedupe repeating marks (e.g., "!!" -> "!", "???" -> "?")
251            (r"([.,;:!?–—]){2,}", r"\1"),
252            # Remove trailing separator before closing bracket, e.g. (RAM;) -> (RAM)
253            (r"([A-Za-z0-9])\s*[;,:]\s*([\)\]\}])", r"\1\2"),
254        ]
255        for pattern, replacement in replacements:
256            txt = re.sub(pattern, replacement, txt)
257
258        txt = self._smartDoubleQuotes(txt)
259
260        self.text = txt.strip()
261        return self.text if input is not None else self
262
263    def removeOutsidePunctuation(self, input: str = None) -> Union[str, "KTextCleaner"]:
264        """Remove punctuation characters at the start or end of the text."""
265        txt = self.get(input)
266
267        punctuationToRemove = r"""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~¡¿“”‘’«»"""
268        txt = re.sub(rf"^[\s{punctuationToRemove}]+", "", txt)  # Leading punctuation
269        txt = re.sub(rf"[\s{punctuationToRemove}]+$", "", txt)  # Trailing punctuation
270
271        self.text = txt.strip()
272        return self.text if input is not None else self
273
274    def removeOrphanedPunctuation(
275        self, input: str = None
276    ) -> Union[str, "KTextCleaner"]:
277        """Remove punctuation characters without relevant counterpart."""
278        txt = self.get(input)
279
280        # Stack-based approach for proper matching
281        pairs = {"(": ")", "[": "]", "{": "}", "“": "”", "‘": "’", "<": ">", "«": "»"}
282        stack = []
283        to_remove = set()
284
285        # Find positions of unmatched brackets
286        for i, char in enumerate(txt):
287            if char in pairs:
288                stack.append((i, char))
289            elif char in pairs.values():
290                if stack and pairs[stack[-1][1]] == char:
291                    stack.pop()
292                else:
293                    to_remove.add(i)  # Unmatched closing bracket
294
295        # Remaining in stack are unmatched opening brackets
296        to_remove.update(pos for pos, _ in stack)
297
298        # Build result without unmatched brackets
299        txt = "".join(char for i, char in enumerate(txt) if i not in to_remove)
300
301        txt = txt.lstrip(
302            ",.:;!?"
303        )  # Also remove common leading punctuation if it becomes leading after cleanup
304        self.text = txt.strip()
305        return self.text if input is not None else self
306
307    def cleanWhitespace(self, input: str = None) -> Union[str, "KTextCleaner"]:
308        """Clean up whitespace around punctuation and condense multiple spaces."""
309        txt = self.get(input)
310
311        replacements = [
312            # Replace multiple spaces with a single space
313            (r"[ \t]{2,}", " "),
314            # Replace multiple line-breaks with a single line-break
315            (r"\n{3,}", "\n\n"),
316            # Remove space before punctuation
317            (r"\s+([.,;:!?])", r"\1"),
318            # Remove space after opening punctuation
319            (r"([\(\[\{¡¿])\s+", r"\1"),
320            # Remove space before closing punctuation
321            (r"\s+([\)\]\}])", r"\1"),
322            # Ensure space after sentence-ending punctuation if followed by capital letter
323            (r"([a-z]+)([.?!])([A-Z][a-z]+\b)", r"\1\2 \3"),
324        ]
325        for pattern, replacement in replacements:
326            txt = re.sub(pattern, replacement, txt)
327
328        self.text = txt.strip()
329        return self.text if input is not None else self
330
331    def cleanExtra(self, input: str = None) -> Union[str, "KTextCleaner"]:
332        # Remove unwanted characters (example: non-ASCII)
333        txt = self.get(input)
334        txt = re.sub(r"[^\x00-\x7F]+", "", txt)
335
336        self.text = txt.strip()
337        return self.text if input is not None else self
338
339    def sanitizeForbidden(
340        self,
341        input: str | list[str] = None,
342        dropStrategy: Literal["word", "sentence"] = "word",
343        extendTerms: list[str] = None,
344    ) -> Union[str, list[str], "KTextCleaner"]:
345        """
346        Sanitize text using the forbidden-terms list.
347
348        Args:
349            input: Input text or list of strings to sanitize.
350            dropStrategy: `word` to remove only incriminating words,
351                `sentence` to drop any sentence containing an incriminating word.
352            extendTerms: Optional list of additional forbidden terms.
353        """
354        txt = self.get(input)
355        terms = self._getForbiddenTerms(extendTerms)
356
357        if dropStrategy not in ["word", "sentence"]:
358            raise ValueError(
359                "dropStrategy must be either 'word' or 'sentence', got "
360                + f"{dropStrategy}"
361            )
362
363        def _containsForbidden(item: str) -> bool:
364            itemLower = item.lower()
365            return any(term in itemLower for term in terms)
366
367        def _normalizeWhitespace(value: str) -> str:
368            replacements = [
369                (r"[ \t]{2,}", " "),
370                (r"\n{3,}", "\n\n"),
371                (r"\s+([.,;:!?])", r"\1"),
372                (r"([\(\[\{¡¿])\s+", r"\1"),
373                (r"\s+([\)\]\}])", r"\1"),
374                (r"([a-z]+)([.?!])([A-Z][a-z]+\b)", r"\1\2 \3"),
375            ]
376            for pattern, replacement in replacements:
377                value = re.sub(pattern, replacement, value)
378            return value.strip()
379
380        def _sanitizeString(value: str) -> str:
381            sentences = content.splitStringToSentences(value)
382
383            if dropStrategy == "sentence":
384                sentences = [s for s in sentences if not _containsForbidden(s)]
385            else:
386                cleanedSentences = []
387                for sentence in sentences:
388                    words = sentence.split(" ")
389                    words = [
390                        word for word in words if word and not _containsForbidden(word)
391                    ]
392                    if words:
393                        cleanedSentences.append(" ".join(words))
394                sentences = cleanedSentences
395
396            cleanValue = " ".join(sentences)
397            return _normalizeWhitespace(cleanValue)
398
399        if isinstance(txt, list):
400            if dropStrategy == "sentence":
401                output = [
402                    _normalizeWhitespace(item)
403                    for item in txt
404                    if not _containsForbidden(item) and _normalizeWhitespace(item)
405                ]
406            else:
407                output = []
408                for item in txt:
409                    cleanItem = _sanitizeString(item)
410                    if cleanItem:
411                        output.append(cleanItem)
412            self.text = output
413        else:
414            output = _sanitizeString(txt)
415            self.text = output.strip()
416
417        return self.text if input is not None else self
KTextCleaner(text: str | list[str] = None)
    def __init__(self, text: str | list[str] | None = None) -> None:
        """Initialize the cleaner, optionally storing the text to be cleaned."""
        self.text = text

Initialize the cleaner and optionally set the text to be cleaned (the cleaning methods return self for chaining when called without an argument).

FORBIDDEN_TERMS_PATH = '/Users/christianjansky/Library/CloudStorage/Dropbox/KOMETA-Draw/01 Content/forbidden-any.txt'
text
def get(self, newText: str | list[str] = None) -> str | list[str]:
33    def get(self, newText: str | list[str] = None) -> str | list[str]:
34        """Return the current text or set a new text."""
35        if newText is not None:
36            self.text = newText
37        if self.text is None:
38            raise ValueError("No text available for cleaning.")
39        return self.text

Return the current text or set a new text.

def cleanWikipedia( self, input: str = None) -> Union[str, KTextCleaner]:
41    def cleanWikipedia(self, input: str = None) -> Union[str, "KTextCleaner"]:
42        """Clean Wikipedia-specific markup from text."""
43        txt = self.get(input)
44
45        replacements = [
46            # Add space when citation is directly followed by lowercase letter
47            (r"(\[\d+(?::?\d+(?:[-–]\d+)?)?\])([a-z])", r"\1 \2"),
48            (r"(\[(?:[Nn]ote|[Nn]?[Bb]|n)\s+\d+\])([a-z])", r"\1 \2"),
49            (r"(\[[a-z]\])([a-z])", r"\1 \2"),
50            # Replace section headers like == Header == with just "Header."
51            (r"^={2,}\s*(.*?)\s*={2,}$", r"\1."),
52        ]
53        for pattern, replacement in replacements:
54            txt = re.sub(
55                pattern, replacement, txt, flags=re.MULTILINE
56            )  # MULTILINE for ^ $ to work line-wise
57
58        removals = [
59            # Remove citation references preceeded by non-whitespace: [1][2], [3]:15
60            r"(?<=\S)\[\d+\](?::?\d+(?:[-–]\d+)?)?",
61            # Remove note references: [note 1], [n 1], [nb 1], [NB 1], [Note 1]
62            r"\[(?:[Nn]ote|[Nn]?[Bb]|n)\s+\d+\]",
63            # Remove single letter notes: [a], [b], etc.
64            r"(?<=\S)\[[a-z]\]",
65            # Remove editorial tags
66            r"(\{{2}|\[)[Cc]itation( needed)?(\}{2}|\])",
67            r"(\{{2}|\[)[Cc]larification( needed)?(\}{2}|\])",
68            r"(\{{2}|\[)[Vv]erification( needed)?(\}{2}|\])",
69            r"(\{{2}|\[)[Cc]ite [Bb]ook(\}{2}|\])",
70            # Remove IPA pronunciations only (avoid removing regular word/word slashes)
71            # Also consumes optional surrounding punctuation spacing (e.g. "; /ræm/").
72            r"(?:\s*[;,]\s*)?/(?=[^/\n]{1,80}/)(?=[^/\n]*[ˈˌːɪʊəɛæɑɔʃʒθðŋɡɹɾʔ])[^/\n]{1,80}/(?:\s*[,;])?",
73        ]
74        for removal in removals:
75            txt = re.sub(rf"{removal}", "", txt)
76
77        # Minimal cleanup for whitespace artifacts introduced by removals above.
78        cleanupReplacements = [
79            # Prevent leading whitespace right after opening punctuation, e.g. "( born"
80            (r"([\(\[\{])\s+", r"\1"),
81            # Remove whitespace before punctuation introduced by token deletion
82            (r"\s+([,.;:!?])", r"\1"),
83            # Collapse repeated horizontal spaces introduced by removals
84            (r"[ \t]{2,}", " "),
85        ]
86        for pattern, replacement in cleanupReplacements:
87            txt = re.sub(pattern, replacement, txt)
88        self.text = txt.strip()
89        # ! input = '' is falsy and would return self instead of cleaned text, so check explicitly for None
90        # ? May return empty string ''
91        return self.text if input is not None else self

Clean Wikipedia-specific markup from text.

def removeLaTeX( self, input: str = None) -> Union[str, KTextCleaner]:
    def removeLaTeX(self, input: str = None) -> Union[str, "KTextCleaner"]:
        """Remove LaTeX math markup and remnants while preserving prose.

        Three passes run in order: styled math blocks, LaTeX command runs,
        then formula-like brace groups. A final cleanup removes emptied
        brackets, stray braces and doubled punctuation, and normalizes
        whitespace.

        Args:
            input: Text to clean. When omitted, the text already stored on the
                instance is used.

        Returns:
            The cleaned string when ``input`` was passed, otherwise ``self`` so
            calls can be chained.
        """
        txt = self.get(input)

        def _dropStyledMathBlocks(value: str) -> str:
            # Remove complete style wrappers such as {\displaystyle ...} / {\textstyle ...}
            stylePattern = re.compile(r"\{\\(?:display|text)style\b")

            while True:
                match = stylePattern.search(value)
                if not match:
                    break

                start = match.start()
                i = match.end()
                # The wrapper's own "{" is already open.
                depth = 1

                # Walk forward until the brace that closes the wrapper.
                while i < len(value) and depth > 0:
                    char = value[i]
                    if char == "{":
                        depth += 1
                    elif char == "}":
                        depth -= 1
                    i += 1

                if depth == 0:
                    # Balanced block: replace it with a single space.
                    value = value[:start] + " " + value[i:]
                else:
                    # Truncated/malformed block: remove to nearest sentence break.
                    sentenceBreak = re.search(r"[.!?\n]", value[start:])
                    if sentenceBreak:
                        # The break character itself is kept.
                        end = start + sentenceBreak.start()
                        value = value[:start] + " " + value[end:]
                    else:
                        # No break ahead: drop everything from the wrapper on.
                        value = value[:start]
                        break

            return value

        def _dropLatexCommandRuns(value: str) -> str:
            # Remove command groups with braced arguments (including nested via iteration).
            # Each pass strips the innermost groups; loop until nothing changes.
            previous = None
            while previous != value:
                previous = value
                value = re.sub(r"\\[A-Za-z]+\*?\s*\{[^{}]*\}", " ", value)

            # Remove heavy command runs likely to be formula payloads.
            # A run starts at a known command or a sub/superscript and then
            # absorbs trailing operators, brackets, digits and further commands.
            commandRun = re.compile(
                r"(?:\\(?:frac|sqrt|text|mathrm|mathbf|hat|vec|cdot|times|int|sum|prod|partial|nabla|left|right|lVert|rVert|alpha|beta|gamma|delta|epsilon|theta|lambda|mu|nu|pi|rho|sigma|tau|phi|psi|omega|Delta|Omega|to|propto|leq|geq|neq|approx|pm|mp|infty|sin|cos|tan|log|ln)\b"
                r"|[_^]\{[^{}]*\}"
                r"|[_^][A-Za-z0-9])"
                r"(?:[\s{}()\[\],.=:+\-*/<>|]|\\[A-Za-z]+|\d)*"
            )
            value = commandRun.sub(" ", value)

            # Remove any leftover escaped commands.
            value = re.sub(r"\\[A-Za-z]+\*?", " ", value)
            return value

        def _dropFormulaLikeBraces(value: str) -> str:
            # Remove brace blocks with formula-like symbols/structure.
            # "Formula-like" here means the block contains _, ^, =, a backslash,
            # or a digit followed by an arithmetic operator.
            braceFormula = re.compile(r"\{[^{}]*(?:[_^=]|\\|\d\s*[+\-*/=])[^{}]*\}")

            # Repeat so blocks exposed by an earlier removal are caught too.
            previous = None
            while previous != value:
                previous = value
                value = braceFormula.sub(" ", value)

            return value

        txt = _dropStyledMathBlocks(txt)
        txt = _dropLatexCommandRuns(txt)
        txt = _dropFormulaLikeBraces(txt)

        # Remove obvious punctuation remnants from stripped math chunks.
        # Brackets left holding only separators/dashes are dropped entirely.
        txt = re.sub(r"\(\s*[;,:\-–—]*\s*\)", "", txt)
        txt = re.sub(r"\[\s*[;,:\-–—]*\s*\]", "", txt)
        txt = re.sub(r"\{\s*[;,:\-–—]*\s*\}", "", txt)
        # Any brace still present becomes a space.
        txt = re.sub(r"[{}]", " ", txt)
        # No whitespace before punctuation; collapse repeated punctuation marks.
        txt = re.sub(r"\s+([.,;:!?])", r"\1", txt)
        txt = re.sub(r"([.,;:!?]){2,}", r"\1", txt)
        # Trim spaces around line breaks, then collapse spaces and blank lines.
        txt = re.sub(r"[ \t]*\n[ \t]*", "\n", txt)
        txt = re.sub(r"[ \t]{2,}", " ", txt)
        txt = re.sub(r"\n{3,}", "\n\n", txt)

        self.text = txt.strip()
        return self.text if input is not None else self

Remove LaTeX math markup and remnants while preserving prose.

def cleanPunctuation( self, input: str = None) -> Union[str, KTextCleaner]:
181    def cleanPunctuation(self, input: str = None) -> Union[str, "KTextCleaner"]:
182        """Remove empty parentheses, brackets, and braces; condense multiple punctuation marks."""
183        txt = self.get(input)
184
185        replacements = [
186            # Remove empty parentheses, brackets, and braces
187            (r"[\(\[\{]\s*[\)\]\}]", ""),
188            # Replace multiple punctuation marks with a single one
189            (r"([.,;:!?]){2,}", r"\1"),
190        ]
191        for pattern, replacement in replacements:
192            txt = re.sub(pattern, replacement, txt)
193
194        self.text = txt.strip()
195        return self.text if input is not None else self

Remove empty parentheses, brackets, and braces; condense multiple punctuation marks.

def improvePunctuation( self, input: str = None) -> Union[str, KTextCleaner]:
225    def improvePunctuation(self, input: str = None) -> Union[str, "KTextCleaner"]:
226        """Convert dumb quotes to smart quotes."""
227        txt = self.get(input)
228
229        replacements = [
230            # Apostrophes between letters (e.g., Wet'n'Wild, O'Neill)
231            (r"(?<=[A-Za-z])'(?=[A-Za-z])", "’"),
232            # Apostrophes in contractions and possessives
233            (r"\b([A-Za-z]+)'([A-Za-z]*)\b", r"\1’\2"),
234            # Single quotes (not measurements like 5', not after digits) for quotes around words (at least 2 characters to avoid contractions like ’n’)
235            (r"(?<!\w)(?<!\d)'([^']{2,}?)'(?!\d)", r"‘\1’"),
236            # Single quote used as apostrophe for s (e.g., Chris')
237            (r"([A-Za-z])'(\s)\b", r"\1’\2"),
238            # Replace hyphen with en-dash for number ranges (years in parens, dates, etc - not math)
239            (r"(\d{4})\s*-\s*(\d{4})", r"\1–\2"),
240            # Abbreviate shortened year NOT preceded by digit with apostrophe, e.g., '95 or '60s
241            (r"(?<!\d)'(\d{2})(s?)\b", r"’\1\2"),
242            # Add apostrophe for word contractions: 'tis -> ’tis
243            (r"(?<!\w)'([A-Za-z]+)\b", r"’\1"),
244            # Convert double hyphens to em dash
245            (r"--+", "—"),
246            # Add em dash between words with spaces around
247            (r"(\w\s+)-(\s+\w)", r"\1—\2"),
248            # Add ellipsis character for three dots with a space before and after
249            (r"(\w)\s*\.{3}", r"\1 …"),
250            # Dedupe repeating marks (e.g., "!!" -> "!", "???" -> "?")
251            (r"([.,;:!?–—]){2,}", r"\1"),
252            # Remove trailing separator before closing bracket, e.g. (RAM;) -> (RAM)
253            (r"([A-Za-z0-9])\s*[;,:]\s*([\)\]\}])", r"\1\2"),
254        ]
255        for pattern, replacement in replacements:
256            txt = re.sub(pattern, replacement, txt)
257
258        txt = self._smartDoubleQuotes(txt)
259
260        self.text = txt.strip()
261        return self.text if input is not None else self

Convert dumb quotes and apostrophes to smart ones, and normalize dashes, ellipses, and repeated punctuation marks.

def removeOutsidePunctuation( self, input: str = None) -> Union[str, KTextCleaner]:
263    def removeOutsidePunctuation(self, input: str = None) -> Union[str, "KTextCleaner"]:
264        """Remove punctuation characters at the start or end of the text."""
265        txt = self.get(input)
266
267        punctuationToRemove = r"""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~¡¿“”‘’«»"""
268        txt = re.sub(rf"^[\s{punctuationToRemove}]+", "", txt)  # Leading punctuation
269        txt = re.sub(rf"[\s{punctuationToRemove}]+$", "", txt)  # Trailing punctuation
270
271        self.text = txt.strip()
272        return self.text if input is not None else self

Remove punctuation characters at the start or end of the text.

def removeOrphanedPunctuation( self, input: str = None) -> Union[str, KTextCleaner]:
274    def removeOrphanedPunctuation(
275        self, input: str = None
276    ) -> Union[str, "KTextCleaner"]:
277        """Remove punctuation characters without relevant counterpart."""
278        txt = self.get(input)
279
280        # Stack-based approach for proper matching
281        pairs = {"(": ")", "[": "]", "{": "}", "“": "”", "‘": "’", "<": ">", "«": "»"}
282        stack = []
283        to_remove = set()
284
285        # Find positions of unmatched brackets
286        for i, char in enumerate(txt):
287            if char in pairs:
288                stack.append((i, char))
289            elif char in pairs.values():
290                if stack and pairs[stack[-1][1]] == char:
291                    stack.pop()
292                else:
293                    to_remove.add(i)  # Unmatched closing bracket
294
295        # Remaining in stack are unmatched opening brackets
296        to_remove.update(pos for pos, _ in stack)
297
298        # Build result without unmatched brackets
299        txt = "".join(char for i, char in enumerate(txt) if i not in to_remove)
300
301        txt = txt.lstrip(
302            ",.:;!?"
303        )  # Also remove common leading punctuation if it becomes leading after cleanup
304        self.text = txt.strip()
305        return self.text if input is not None else self

Remove punctuation characters without relevant counterpart.

def cleanWhitespace( self, input: str = None) -> Union[str, KTextCleaner]:
307    def cleanWhitespace(self, input: str = None) -> Union[str, "KTextCleaner"]:
308        """Clean up whitespace around punctuation and condense multiple spaces."""
309        txt = self.get(input)
310
311        replacements = [
312            # Replace multiple spaces with a single space
313            (r"[ \t]{2,}", " "),
314            # Replace multiple line-breaks with a single line-break
315            (r"\n{3,}", "\n\n"),
316            # Remove space before punctuation
317            (r"\s+([.,;:!?])", r"\1"),
318            # Remove space after opening punctuation
319            (r"([\(\[\{¡¿])\s+", r"\1"),
320            # Remove space before closing punctuation
321            (r"\s+([\)\]\}])", r"\1"),
322            # Ensure space after sentence-ending punctuation if followed by capital letter
323            (r"([a-z]+)([.?!])([A-Z][a-z]+\b)", r"\1\2 \3"),
324        ]
325        for pattern, replacement in replacements:
326            txt = re.sub(pattern, replacement, txt)
327
328        self.text = txt.strip()
329        return self.text if input is not None else self

Clean up whitespace around punctuation and condense multiple spaces.

def cleanExtra( self, input: str = None) -> Union[str, KTextCleaner]:
331    def cleanExtra(self, input: str = None) -> Union[str, "KTextCleaner"]:
332        # Remove unwanted characters (example: non-ASCII)
333        txt = self.get(input)
334        txt = re.sub(r"[^\x00-\x7F]+", "", txt)
335
336        self.text = txt.strip()
337        return self.text if input is not None else self
def sanitizeForbidden( self, input: str | list[str] = None, dropStrategy: Literal['word', 'sentence'] = 'word', extendTerms: list[str] = None) -> Union[str, list[str], KTextCleaner]:
339    def sanitizeForbidden(
340        self,
341        input: str | list[str] = None,
342        dropStrategy: Literal["word", "sentence"] = "word",
343        extendTerms: list[str] = None,
344    ) -> Union[str, list[str], "KTextCleaner"]:
345        """
346        Sanitize text using the forbidden-terms list.
347
348        Args:
349            input: Input text or list of strings to sanitize.
350            dropStrategy: `word` to remove only incriminating words,
351                `sentence` to drop any sentence containing an incriminating word.
352            extendTerms: Optional list of additional forbidden terms.
353        """
354        txt = self.get(input)
355        terms = self._getForbiddenTerms(extendTerms)
356
357        if dropStrategy not in ["word", "sentence"]:
358            raise ValueError(
359                "dropStrategy must be either 'word' or 'sentence', got "
360                + f"{dropStrategy}"
361            )
362
363        def _containsForbidden(item: str) -> bool:
364            itemLower = item.lower()
365            return any(term in itemLower for term in terms)
366
367        def _normalizeWhitespace(value: str) -> str:
368            replacements = [
369                (r"[ \t]{2,}", " "),
370                (r"\n{3,}", "\n\n"),
371                (r"\s+([.,;:!?])", r"\1"),
372                (r"([\(\[\{¡¿])\s+", r"\1"),
373                (r"\s+([\)\]\}])", r"\1"),
374                (r"([a-z]+)([.?!])([A-Z][a-z]+\b)", r"\1\2 \3"),
375            ]
376            for pattern, replacement in replacements:
377                value = re.sub(pattern, replacement, value)
378            return value.strip()
379
380        def _sanitizeString(value: str) -> str:
381            sentences = content.splitStringToSentences(value)
382
383            if dropStrategy == "sentence":
384                sentences = [s for s in sentences if not _containsForbidden(s)]
385            else:
386                cleanedSentences = []
387                for sentence in sentences:
388                    words = sentence.split(" ")
389                    words = [
390                        word for word in words if word and not _containsForbidden(word)
391                    ]
392                    if words:
393                        cleanedSentences.append(" ".join(words))
394                sentences = cleanedSentences
395
396            cleanValue = " ".join(sentences)
397            return _normalizeWhitespace(cleanValue)
398
399        if isinstance(txt, list):
400            if dropStrategy == "sentence":
401                output = [
402                    _normalizeWhitespace(item)
403                    for item in txt
404                    if not _containsForbidden(item) and _normalizeWhitespace(item)
405                ]
406            else:
407                output = []
408                for item in txt:
409                    cleanItem = _sanitizeString(item)
410                    if cleanItem:
411                        output.append(cleanItem)
412            self.text = output
413        else:
414            output = _sanitizeString(txt)
415            self.text = output.strip()
416
417        return self.text if input is not None else self

Sanitize text using the forbidden-terms list.

Arguments:
  • input: Input text or list of strings to sanitize.
  • dropStrategy: word to remove only incriminating words, sentence to drop any sentence containing an incriminating word.
  • extendTerms: Optional list of additional forbidden terms.