classes.c34_text_cleaner
import re
from typing import Literal, Union
from loguru import logger
from icecream import ic

from lib import content


class KTextCleaner:
    """Chainable collection of regex-based text clean-up passes.

    Every public cleaning method takes an optional ``input``. When ``input`` is
    given it replaces the stored text and the cleaned value is returned; when
    it is omitted the stored text is cleaned and ``self`` is returned so that
    calls can be chained.
    """

    # Default location of the forbidden-terms list (read line by line).
    FORBIDDEN_TERMS_PATH = "/Users/christianjansky/Library/CloudStorage/Dropbox/KOMETA-Draw/01 Content/forbidden-any.txt"

    # Whitespace rules shared by cleanWhitespace and sanitizeForbidden.
    _WHITESPACE_RULES = (
        # Replace multiple spaces/tabs with a single space
        (r"[ \t]{2,}", " "),
        # Collapse three or more line-breaks into one blank line
        (r"\n{3,}", "\n\n"),
        # Remove space before punctuation
        (r"\s+([.,;:!?])", r"\1"),
        # Remove space after opening punctuation
        (r"([\(\[\{¡¿])\s+", r"\1"),
        # Remove space before closing punctuation
        (r"\s+([\)\]\}])", r"\1"),
        # Ensure a space after sentence-ending punctuation followed by a capitalised word
        (r"([a-z]+)([.?!])([A-Z][a-z]+\b)", r"\1\2 \3"),
    )

    def __init__(self, text: str | list[str] | None = None):
        """Store the optional text to be cleaned."""
        self.text = text

    @staticmethod
    def _applyWhitespaceRules(value: str) -> str:
        """Apply the shared whitespace rules in order and strip the result."""
        for pattern, replacement in KTextCleaner._WHITESPACE_RULES:
            value = re.sub(pattern, replacement, value)
        return value.strip()

    def _getForbiddenTerms(self, extendTerms: list[str] | None = None) -> list[str]:
        """Load forbidden terms from the default file and optionally extend the list.

        Terms are stripped, lower-cased and de-duplicated; blank entries are
        dropped and first-seen order is preserved.

        Raises:
            OSError: If the file at ``FORBIDDEN_TERMS_PATH`` cannot be opened.
        """
        with open(self.FORBIDDEN_TERMS_PATH, encoding="utf-8") as f:
            terms = f.read().splitlines()

        if extendTerms:
            terms.extend(extendTerms)

        normalized = (term.strip().lower() for term in terms)
        # dict.fromkeys de-duplicates in linear time while keeping insertion order.
        return list(dict.fromkeys(term for term in normalized if term))

    def get(self, newText: str | list[str] | None = None) -> str | list[str]:
        """Return the current text, replacing it with ``newText`` first when given.

        Raises:
            ValueError: If no text is stored after the optional replacement.
        """
        if newText is not None:
            self.text = newText
        if self.text is None:
            raise ValueError("No text available for cleaning.")
        return self.text

    def cleanWikipedia(self, input: str | None = None) -> Union[str, "KTextCleaner"]:
        """Clean Wikipedia-specific markup from text.

        Runs three ordered stages: spacing/header replacements, removal of
        citation, note, editorial and IPA markup, then whitespace cleanup.

        Returns:
            The cleaned string when ``input`` is given (may be ``""``),
            otherwise ``self``.
        """
        txt = self.get(input)

        replacements = [
            # Add space when citation is directly followed by lowercase letter
            (r"(\[\d+(?::?\d+(?:[-–]\d+)?)?\])([a-z])", r"\1 \2"),
            (r"(\[(?:[Nn]ote|[Nn]?[Bb]|n)\s+\d+\])([a-z])", r"\1 \2"),
            (r"(\[[a-z]\])([a-z])", r"\1 \2"),
            # Replace section headers like == Header == with just "Header."
            (r"^={2,}\s*(.*?)\s*={2,}$", r"\1."),
        ]
        for pattern, replacement in replacements:
            # MULTILINE so that ^ and $ work line-wise
            txt = re.sub(pattern, replacement, txt, flags=re.MULTILINE)

        removals = [
            # Remove citation references preceded by non-whitespace: [1][2], [3]:15
            r"(?<=\S)\[\d+\](?::?\d+(?:[-–]\d+)?)?",
            # Remove note references: [note 1], [n 1], [nb 1], [NB 1], [Note 1]
            r"\[(?:[Nn]ote|[Nn]?[Bb]|n)\s+\d+\]",
            # Remove single letter notes: [a], [b], etc.
            r"(?<=\S)\[[a-z]\]",
            # Remove editorial tags
            r"(\{{2}|\[)[Cc]itation( needed)?(\}{2}|\])",
            r"(\{{2}|\[)[Cc]larification( needed)?(\}{2}|\])",
            r"(\{{2}|\[)[Vv]erification( needed)?(\}{2}|\])",
            r"(\{{2}|\[)[Cc]ite [Bb]ook(\}{2}|\])",
            # Remove IPA pronunciations only (avoid removing regular word/word slashes)
            # Also consumes optional surrounding punctuation spacing (e.g. "; /ræm/").
            r"(?:\s*[;,]\s*)?/(?=[^/\n]{1,80}/)(?=[^/\n]*[ˈˌːɪʊəɛæɑɔʃʒθðŋɡɹɾʔ])[^/\n]{1,80}/(?:\s*[,;])?",
        ]
        for removal in removals:
            txt = re.sub(removal, "", txt)

        # Minimal cleanup for whitespace artifacts introduced by removals above.
        cleanupReplacements = [
            # Prevent leading whitespace right after opening punctuation, e.g. "( born"
            (r"([\(\[\{])\s+", r"\1"),
            # Remove whitespace before punctuation introduced by token deletion
            (r"\s+([,.;:!?])", r"\1"),
            # Collapse repeated horizontal spaces introduced by removals
            (r"[ \t]{2,}", " "),
        ]
        for pattern, replacement in cleanupReplacements:
            txt = re.sub(pattern, replacement, txt)

        self.text = txt.strip()
        # input == '' is falsy, so compare against None explicitly.
        return self.text if input is not None else self

    def removeLaTeX(self, input: str | None = None) -> Union[str, "KTextCleaner"]:
        """Remove LaTeX math markup and remnants while preserving prose.

        The passes are heuristic and order-dependent: styled blocks, then
        command runs, then formula-like brace groups, then punctuation and
        whitespace remnants.

        Returns:
            The cleaned string when ``input`` is given, otherwise ``self``.
        """
        txt = self.get(input)

        def _dropStyledMathBlocks(value: str) -> str:
            # Remove complete style wrappers such as {\displaystyle ...} / {\textstyle ...}
            stylePattern = re.compile(r"\{\\(?:display|text)style\b")

            while True:
                match = stylePattern.search(value)
                if not match:
                    break

                start = match.start()
                i = match.end()
                depth = 1  # the wrapper's own opening brace

                # Walk forward until the wrapper's matching closing brace.
                while i < len(value) and depth > 0:
                    char = value[i]
                    if char == "{":
                        depth += 1
                    elif char == "}":
                        depth -= 1
                    i += 1

                if depth == 0:
                    value = value[:start] + " " + value[i:]
                else:
                    # Truncated/malformed block: remove to nearest sentence break.
                    sentenceBreak = re.search(r"[.!?\n]", value[start:])
                    if sentenceBreak:
                        end = start + sentenceBreak.start()
                        value = value[:start] + " " + value[end:]
                    else:
                        # Nothing after the block start survives.
                        value = value[:start]
                        break

            return value

        def _dropLatexCommandRuns(value: str) -> str:
            # Remove command groups with braced arguments (including nested via iteration).
            previous = None
            while previous != value:
                previous = value
                value = re.sub(r"\\[A-Za-z]+\*?\s*\{[^{}]*\}", " ", value)

            # Remove heavy command runs likely to be formula payloads.
            commandRun = re.compile(
                r"(?:\\(?:frac|sqrt|text|mathrm|mathbf|hat|vec|cdot|times|int|sum|prod|partial|nabla|left|right|lVert|rVert|alpha|beta|gamma|delta|epsilon|theta|lambda|mu|nu|pi|rho|sigma|tau|phi|psi|omega|Delta|Omega|to|propto|leq|geq|neq|approx|pm|mp|infty|sin|cos|tan|log|ln)\b"
                r"|[_^]\{[^{}]*\}"
                r"|[_^][A-Za-z0-9])"
                r"(?:[\s{}()\[\],.=:+\-*/<>|]|\\[A-Za-z]+|\d)*"
            )
            value = commandRun.sub(" ", value)

            # Remove any leftover escaped commands.
            value = re.sub(r"\\[A-Za-z]+\*?", " ", value)
            return value

        def _dropFormulaLikeBraces(value: str) -> str:
            # Remove brace blocks with formula-like symbols/structure.
            braceFormula = re.compile(r"\{[^{}]*(?:[_^=]|\\|\d\s*[+\-*/=])[^{}]*\}")

            # Repeat until stable so that outer groups are removed once inner ones are gone.
            previous = None
            while previous != value:
                previous = value
                value = braceFormula.sub(" ", value)

            return value

        txt = _dropStyledMathBlocks(txt)
        txt = _dropLatexCommandRuns(txt)
        txt = _dropFormulaLikeBraces(txt)

        # Remove obvious punctuation remnants from stripped math chunks.
        txt = re.sub(r"\(\s*[;,:\-–—]*\s*\)", "", txt)
        txt = re.sub(r"\[\s*[;,:\-–—]*\s*\]", "", txt)
        txt = re.sub(r"\{\s*[;,:\-–—]*\s*\}", "", txt)
        txt = re.sub(r"[{}]", " ", txt)
        txt = re.sub(r"\s+([.,;:!?])", r"\1", txt)
        txt = re.sub(r"([.,;:!?]){2,}", r"\1", txt)
        txt = re.sub(r"[ \t]*\n[ \t]*", "\n", txt)
        txt = re.sub(r"[ \t]{2,}", " ", txt)
        txt = re.sub(r"\n{3,}", "\n\n", txt)

        self.text = txt.strip()
        return self.text if input is not None else self

    def cleanPunctuation(self, input: str | None = None) -> Union[str, "KTextCleaner"]:
        """Remove empty parentheses, brackets, and braces; condense multiple punctuation marks.

        Returns:
            The cleaned string when ``input`` is given, otherwise ``self``.
        """
        txt = self.get(input)

        replacements = [
            # Remove empty parentheses, brackets, and braces
            (r"[\(\[\{]\s*[\)\]\}]", ""),
            # Replace multiple punctuation marks with a single one
            (r"([.,;:!?]){2,}", r"\1"),
        ]
        for pattern, replacement in replacements:
            txt = re.sub(pattern, replacement, txt)

        self.text = txt.strip()
        return self.text if input is not None else self

    def _smartDoubleQuotes(self, txt: str) -> str:
        """Convert dumb double quotes to smart quotes with basic nesting support.

        A quote is treated as opening at the start of the text, after
        whitespace/opening punctuation, or after ``:;,.!?`` when a letter
        follows; every other quote is treated as closing. A quote directly
        after a digit is left untouched.
        """
        opening_context = set(" \t\n\r([{-–—/\\“‘«")
        result = []
        length = len(txt)

        for i, char in enumerate(txt):
            if char != '"':
                result.append(char)
                continue

            prev_char = txt[i - 1] if i > 0 else ""
            next_char = txt[i + 1] if i + 1 < length else ""

            # Keep measurement marks like 11" unchanged.
            if prev_char.isdigit():
                result.append(char)
                continue

            is_opening = (
                i == 0
                or prev_char in opening_context
                or (prev_char in ":;,.!?" and next_char and next_char.isalpha())
            )
            result.append("“" if is_opening else "”")

        return "".join(result)

    def improvePunctuation(self, input: str | None = None) -> Union[str, "KTextCleaner"]:
        """Apply typographic punctuation: curly quotes, dashes and ellipsis.

        The regex rules run in the listed order, then straight double quotes
        are converted by ``_smartDoubleQuotes``.

        Returns:
            The improved string when ``input`` is given, otherwise ``self``.
        """
        txt = self.get(input)

        replacements = [
            # Apostrophes between letters (e.g., Wet'n'Wild, O'Neill)
            (r"(?<=[A-Za-z])'(?=[A-Za-z])", "’"),
            # Apostrophes in contractions and possessives
            (r"\b([A-Za-z]+)'([A-Za-z]*)\b", r"\1’\2"),
            # Single quotes (not measurements like 5', not after digits) for quotes around words (at least 2 characters to avoid contractions like ’n’)
            (r"(?<!\w)(?<!\d)'([^']{2,}?)'(?!\d)", r"‘\1’"),
            # Single quote used as apostrophe for s (e.g., Chris')
            (r"([A-Za-z])'(\s)\b", r"\1’\2"),
            # Replace hyphen with en-dash for number ranges (years in parens, dates, etc - not math)
            (r"(\d{4})\s*-\s*(\d{4})", r"\1–\2"),
            # Abbreviate shortened year NOT preceded by digit with apostrophe, e.g., '95 or '60s
            (r"(?<!\d)'(\d{2})(s?)\b", r"’\1\2"),
            # Add apostrophe for word contractions: 'tis -> ’tis
            (r"(?<!\w)'([A-Za-z]+)\b", r"’\1"),
            # Convert double hyphens to em dash
            (r"--+", "—"),
            # Add em dash between words with spaces around
            (r"(\w\s+)-(\s+\w)", r"\1—\2"),
            # Replace three dots after a word character with " …"
            (r"(\w)\s*\.{3}", r"\1 …"),
            # Dedupe repeating marks (e.g., "!!" -> "!", "???" -> "?")
            (r"([.,;:!?–—]){2,}", r"\1"),
            # Remove trailing separator before closing bracket, e.g. (RAM;) -> (RAM)
            (r"([A-Za-z0-9])\s*[;,:]\s*([\)\]\}])", r"\1\2"),
        ]
        for pattern, replacement in replacements:
            txt = re.sub(pattern, replacement, txt)

        txt = self._smartDoubleQuotes(txt)

        self.text = txt.strip()
        return self.text if input is not None else self

    def removeOutsidePunctuation(self, input: str | None = None) -> Union[str, "KTextCleaner"]:
        """Remove punctuation and whitespace at the start and end of the text.

        Returns:
            The trimmed string when ``input`` is given, otherwise ``self``.
        """
        txt = self.get(input)

        punctuationToRemove = r"""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~¡¿“”‘’«»"""
        # Escape before embedding in a character class: unescaped, "[\]" is read
        # as an escaped "]" (so the backslash is never matched) and ",-." only
        # works because it happens to form a valid range.
        charClass = rf"[\s{re.escape(punctuationToRemove)}]"
        txt = re.sub(rf"^{charClass}+", "", txt)  # Leading punctuation
        txt = re.sub(rf"{charClass}+$", "", txt)  # Trailing punctuation

        self.text = txt.strip()
        return self.text if input is not None else self

    def removeOrphanedPunctuation(
        self, input: str | None = None
    ) -> Union[str, "KTextCleaner"]:
        """Remove bracket and quote characters that have no matching counterpart.

        A ``’`` directly between two letters is treated as an apostrophe and is
        neither removed nor matched against an opening ``‘``.

        Returns:
            The cleaned string when ``input`` is given, otherwise ``self``.
        """
        txt = self.get(input)

        # Stack-based approach for proper matching
        pairs = {"(": ")", "[": "]", "{": "}", "“": "”", "‘": "’", "<": ">", "«": "»"}
        closers = set(pairs.values())
        stack = []
        to_remove = set()
        lastIndex = len(txt) - 1

        # Find positions of unmatched brackets
        for i, char in enumerate(txt):
            if char in pairs:
                stack.append((i, char))
            elif char in closers:
                isApostrophe = (
                    char == "’"
                    and 0 < i < lastIndex
                    and txt[i - 1].isalpha()
                    and txt[i + 1].isalpha()
                )
                if isApostrophe:
                    # In-word apostrophe (don’t, O’Neill), not a closing quote.
                    continue
                if stack and pairs[stack[-1][1]] == char:
                    stack.pop()
                else:
                    to_remove.add(i)  # Unmatched closing bracket

        # Remaining in stack are unmatched opening brackets
        to_remove.update(pos for pos, _ in stack)

        # Build result without unmatched brackets
        txt = "".join(char for i, char in enumerate(txt) if i not in to_remove)

        # Also remove common leading punctuation if it becomes leading after cleanup
        txt = txt.lstrip(",.:;!?")
        self.text = txt.strip()
        return self.text if input is not None else self

    def cleanWhitespace(self, input: str | None = None) -> Union[str, "KTextCleaner"]:
        """Clean up whitespace around punctuation and condense multiple spaces.

        Returns:
            The cleaned string when ``input`` is given, otherwise ``self``.
        """
        txt = self.get(input)
        self.text = self._applyWhitespaceRules(txt)
        return self.text if input is not None else self

    def cleanExtra(self, input: str | None = None) -> Union[str, "KTextCleaner"]:
        """Remove every non-ASCII character from the text.

        This also removes typographic quotes, dashes and accented letters.

        Returns:
            The cleaned string when ``input`` is given, otherwise ``self``.
        """
        txt = self.get(input)
        txt = re.sub(r"[^\x00-\x7F]+", "", txt)

        self.text = txt.strip()
        return self.text if input is not None else self

    def sanitizeForbidden(
        self,
        input: str | list[str] | None = None,
        dropStrategy: Literal["word", "sentence"] = "word",
        extendTerms: list[str] | None = None,
    ) -> Union[str, list[str], "KTextCleaner"]:
        """
        Sanitize text using the forbidden-terms list.

        Matching is a case-insensitive substring test against each term.

        Args:
            input: Input text or list of strings to sanitize.
            dropStrategy: `word` to remove only incriminating words,
                `sentence` to drop any sentence containing an incriminating word.
                For list input with `sentence`, whole list items are dropped.
            extendTerms: Optional list of additional forbidden terms.

        Returns:
            The sanitized string or list when ``input`` is given, otherwise ``self``.

        Raises:
            ValueError: If no text is available or ``dropStrategy`` is invalid.
            OSError: If the forbidden-terms file cannot be opened.
        """
        txt = self.get(input)

        # Validate before touching the file system so a bad argument fails fast.
        if dropStrategy not in ["word", "sentence"]:
            raise ValueError(
                "dropStrategy must be either 'word' or 'sentence', got "
                + f"{dropStrategy}"
            )

        terms = self._getForbiddenTerms(extendTerms)

        def _containsForbidden(item: str) -> bool:
            itemLower = item.lower()
            return any(term in itemLower for term in terms)

        def _sanitizeString(value: str) -> str:
            sentences = content.splitStringToSentences(value)

            if dropStrategy == "sentence":
                sentences = [s for s in sentences if not _containsForbidden(s)]
            else:
                cleanedSentences = []
                for sentence in sentences:
                    words = [
                        word
                        for word in sentence.split(" ")
                        if word and not _containsForbidden(word)
                    ]
                    if words:
                        cleanedSentences.append(" ".join(words))
                sentences = cleanedSentences

            return self._applyWhitespaceRules(" ".join(sentences))

        if isinstance(txt, list):
            if dropStrategy == "sentence":
                # Each list item is treated as one unit; normalise it only once.
                output = [
                    cleanItem
                    for item in txt
                    if not _containsForbidden(item)
                    and (cleanItem := self._applyWhitespaceRules(item))
                ]
            else:
                output = [
                    cleanItem for item in txt if (cleanItem := _sanitizeString(item))
                ]
            self.text = output
        else:
            output = _sanitizeString(txt)
            self.text = output.strip()

        return self.text if input is not None else self
class
KTextCleaner:
10class KTextCleaner: 11 FORBIDDEN_TERMS_PATH = "/Users/christianjansky/Library/CloudStorage/Dropbox/KOMETA-Draw/01 Content/forbidden-any.txt" 12 13 def __init__(self, text: str | list[str] = None): 14 """Initialize and optionally set text to be cleaned (will return self for chaining).""" 15 self.text = text 16 17 def _getForbiddenTerms(self, extendTerms: list[str] = None) -> list[str]: 18 """Load forbidden terms from default file and optionally extend the list.""" 19 with open(self.FORBIDDEN_TERMS_PATH, encoding="utf-8") as f: 20 terms = f.read().splitlines() 21 22 if extendTerms: 23 terms.extend(extendTerms) 24 25 cleanTerms = [] 26 for term in terms: 27 cleaned = term.strip().lower() 28 if cleaned and cleaned not in cleanTerms: 29 cleanTerms.append(cleaned) 30 31 return cleanTerms 32 33 def get(self, newText: str | list[str] = None) -> str | list[str]: 34 """Return the current text or set a new text.""" 35 if newText is not None: 36 self.text = newText 37 if self.text is None: 38 raise ValueError("No text available for cleaning.") 39 return self.text 40 41 def cleanWikipedia(self, input: str = None) -> Union[str, "KTextCleaner"]: 42 """Clean Wikipedia-specific markup from text.""" 43 txt = self.get(input) 44 45 replacements = [ 46 # Add space when citation is directly followed by lowercase letter 47 (r"(\[\d+(?::?\d+(?:[-–]\d+)?)?\])([a-z])", r"\1 \2"), 48 (r"(\[(?:[Nn]ote|[Nn]?[Bb]|n)\s+\d+\])([a-z])", r"\1 \2"), 49 (r"(\[[a-z]\])([a-z])", r"\1 \2"), 50 # Replace section headers like == Header == with just "Header." 
51 (r"^={2,}\s*(.*?)\s*={2,}$", r"\1."), 52 ] 53 for pattern, replacement in replacements: 54 txt = re.sub( 55 pattern, replacement, txt, flags=re.MULTILINE 56 ) # MULTILINE for ^ $ to work line-wise 57 58 removals = [ 59 # Remove citation references preceeded by non-whitespace: [1][2], [3]:15 60 r"(?<=\S)\[\d+\](?::?\d+(?:[-–]\d+)?)?", 61 # Remove note references: [note 1], [n 1], [nb 1], [NB 1], [Note 1] 62 r"\[(?:[Nn]ote|[Nn]?[Bb]|n)\s+\d+\]", 63 # Remove single letter notes: [a], [b], etc. 64 r"(?<=\S)\[[a-z]\]", 65 # Remove editorial tags 66 r"(\{{2}|\[)[Cc]itation( needed)?(\}{2}|\])", 67 r"(\{{2}|\[)[Cc]larification( needed)?(\}{2}|\])", 68 r"(\{{2}|\[)[Vv]erification( needed)?(\}{2}|\])", 69 r"(\{{2}|\[)[Cc]ite [Bb]ook(\}{2}|\])", 70 # Remove IPA pronunciations only (avoid removing regular word/word slashes) 71 # Also consumes optional surrounding punctuation spacing (e.g. "; /ræm/"). 72 r"(?:\s*[;,]\s*)?/(?=[^/\n]{1,80}/)(?=[^/\n]*[ˈˌːɪʊəɛæɑɔʃʒθðŋɡɹɾʔ])[^/\n]{1,80}/(?:\s*[,;])?", 73 ] 74 for removal in removals: 75 txt = re.sub(rf"{removal}", "", txt) 76 77 # Minimal cleanup for whitespace artifacts introduced by removals above. 78 cleanupReplacements = [ 79 # Prevent leading whitespace right after opening punctuation, e.g. "( born" 80 (r"([\(\[\{])\s+", r"\1"), 81 # Remove whitespace before punctuation introduced by token deletion 82 (r"\s+([,.;:!?])", r"\1"), 83 # Collapse repeated horizontal spaces introduced by removals 84 (r"[ \t]{2,}", " "), 85 ] 86 for pattern, replacement in cleanupReplacements: 87 txt = re.sub(pattern, replacement, txt) 88 self.text = txt.strip() 89 # ! input = '' is falsy and would return self instead of cleaned text, so check explicitly for None 90 # ? 
May return empty string '' 91 return self.text if input is not None else self 92 93 def removeLaTeX(self, input: str = None) -> Union[str, "KTextCleaner"]: 94 """Remove LaTeX math markup and remnants while preserving prose.""" 95 txt = self.get(input) 96 97 def _dropStyledMathBlocks(value: str) -> str: 98 # Remove complete style wrappers such as {\displaystyle ...} / {\textstyle ...} 99 stylePattern = re.compile(r"\{\\(?:display|text)style\b") 100 101 while True: 102 match = stylePattern.search(value) 103 if not match: 104 break 105 106 start = match.start() 107 i = match.end() 108 depth = 1 109 110 while i < len(value) and depth > 0: 111 char = value[i] 112 if char == "{": 113 depth += 1 114 elif char == "}": 115 depth -= 1 116 i += 1 117 118 if depth == 0: 119 value = value[:start] + " " + value[i:] 120 else: 121 # Truncated/malformed block: remove to nearest sentence break. 122 sentenceBreak = re.search(r"[.!?\n]", value[start:]) 123 if sentenceBreak: 124 end = start + sentenceBreak.start() 125 value = value[:start] + " " + value[end:] 126 else: 127 value = value[:start] 128 break 129 130 return value 131 132 def _dropLatexCommandRuns(value: str) -> str: 133 # Remove command groups with braced arguments (including nested via iteration). 134 previous = None 135 while previous != value: 136 previous = value 137 value = re.sub(r"\\[A-Za-z]+\*?\s*\{[^{}]*\}", " ", value) 138 139 # Remove heavy command runs likely to be formula payloads. 140 commandRun = re.compile( 141 r"(?:\\(?:frac|sqrt|text|mathrm|mathbf|hat|vec|cdot|times|int|sum|prod|partial|nabla|left|right|lVert|rVert|alpha|beta|gamma|delta|epsilon|theta|lambda|mu|nu|pi|rho|sigma|tau|phi|psi|omega|Delta|Omega|to|propto|leq|geq|neq|approx|pm|mp|infty|sin|cos|tan|log|ln)\b" 142 r"|[_^]\{[^{}]*\}" 143 r"|[_^][A-Za-z0-9])" 144 r"(?:[\s{}()\[\],.=:+\-*/<>|]|\\[A-Za-z]+|\d)*" 145 ) 146 value = commandRun.sub(" ", value) 147 148 # Remove any leftover escaped commands. 
149 value = re.sub(r"\\[A-Za-z]+\*?", " ", value) 150 return value 151 152 def _dropFormulaLikeBraces(value: str) -> str: 153 # Remove brace blocks with formula-like symbols/structure. 154 braceFormula = re.compile(r"\{[^{}]*(?:[_^=]|\\|\d\s*[+\-*/=])[^{}]*\}") 155 156 previous = None 157 while previous != value: 158 previous = value 159 value = braceFormula.sub(" ", value) 160 161 return value 162 163 txt = _dropStyledMathBlocks(txt) 164 txt = _dropLatexCommandRuns(txt) 165 txt = _dropFormulaLikeBraces(txt) 166 167 # Remove obvious punctuation remnants from stripped math chunks. 168 txt = re.sub(r"\(\s*[;,:\-–—]*\s*\)", "", txt) 169 txt = re.sub(r"\[\s*[;,:\-–—]*\s*\]", "", txt) 170 txt = re.sub(r"\{\s*[;,:\-–—]*\s*\}", "", txt) 171 txt = re.sub(r"[{}]", " ", txt) 172 txt = re.sub(r"\s+([.,;:!?])", r"\1", txt) 173 txt = re.sub(r"([.,;:!?]){2,}", r"\1", txt) 174 txt = re.sub(r"[ \t]*\n[ \t]*", "\n", txt) 175 txt = re.sub(r"[ \t]{2,}", " ", txt) 176 txt = re.sub(r"\n{3,}", "\n\n", txt) 177 178 self.text = txt.strip() 179 return self.text if input is not None else self 180 181 def cleanPunctuation(self, input: str = None) -> Union[str, "KTextCleaner"]: 182 """Remove empty parentheses, brackets, and braces; condense multiple punctuation marks.""" 183 txt = self.get(input) 184 185 replacements = [ 186 # Remove empty parentheses, brackets, and braces 187 (r"[\(\[\{]\s*[\)\]\}]", ""), 188 # Replace multiple punctuation marks with a single one 189 (r"([.,;:!?]){2,}", r"\1"), 190 ] 191 for pattern, replacement in replacements: 192 txt = re.sub(pattern, replacement, txt) 193 194 self.text = txt.strip() 195 return self.text if input is not None else self 196 197 def _smartDoubleQuotes(self, txt: str) -> str: 198 """Convert dumb double quotes to smart quotes with basic nesting support.""" 199 opening_context = set(" \t\n\r([{-–—/\\“‘«") 200 result = [] 201 length = len(txt) 202 203 for i, char in enumerate(txt): 204 if char != '"': 205 result.append(char) 206 continue 207 208 
prev_char = txt[i - 1] if i > 0 else "" 209 next_char = txt[i + 1] if i + 1 < length else "" 210 211 # Keep measurement marks like 11" unchanged. 212 if prev_char.isdigit(): 213 result.append(char) 214 continue 215 216 is_opening = ( 217 i == 0 218 or prev_char in opening_context 219 or (prev_char in ":;,.!?" and next_char and next_char.isalpha()) 220 ) 221 result.append("“" if is_opening else "”") 222 223 return "".join(result) 224 225 def improvePunctuation(self, input: str = None) -> Union[str, "KTextCleaner"]: 226 """Convert dumb quotes to smart quotes.""" 227 txt = self.get(input) 228 229 replacements = [ 230 # Apostrophes between letters (e.g., Wet'n'Wild, O'Neill) 231 (r"(?<=[A-Za-z])'(?=[A-Za-z])", "’"), 232 # Apostrophes in contractions and possessives 233 (r"\b([A-Za-z]+)'([A-Za-z]*)\b", r"\1’\2"), 234 # Single quotes (not measurements like 5', not after digits) for quotes around words (at least 2 characters to avoid contractions like ’n’) 235 (r"(?<!\w)(?<!\d)'([^']{2,}?)'(?!\d)", r"‘\1’"), 236 # Single quote used as apostrophe for s (e.g., Chris') 237 (r"([A-Za-z])'(\s)\b", r"\1’\2"), 238 # Replace hyphen with en-dash for number ranges (years in parens, dates, etc - not math) 239 (r"(\d{4})\s*-\s*(\d{4})", r"\1–\2"), 240 # Abbreviate shortened year NOT preceded by digit with apostrophe, e.g., '95 or '60s 241 (r"(?<!\d)'(\d{2})(s?)\b", r"’\1\2"), 242 # Add apostrophe for word contractions: 'tis -> ’tis 243 (r"(?<!\w)'([A-Za-z]+)\b", r"’\1"), 244 # Convert double hyphens to em dash 245 (r"--+", "—"), 246 # Add em dash between words with spaces around 247 (r"(\w\s+)-(\s+\w)", r"\1—\2"), 248 # Add ellipsis character for three dots with a space before and after 249 (r"(\w)\s*\.{3}", r"\1 …"), 250 # Dedupe repeating marks (e.g., "!!" -> "!", "???" -> "?") 251 (r"([.,;:!?–—]){2,}", r"\1"), 252 # Remove trailing separator before closing bracket, e.g. 
(RAM;) -> (RAM) 253 (r"([A-Za-z0-9])\s*[;,:]\s*([\)\]\}])", r"\1\2"), 254 ] 255 for pattern, replacement in replacements: 256 txt = re.sub(pattern, replacement, txt) 257 258 txt = self._smartDoubleQuotes(txt) 259 260 self.text = txt.strip() 261 return self.text if input is not None else self 262 263 def removeOutsidePunctuation(self, input: str = None) -> Union[str, "KTextCleaner"]: 264 """Remove punctuation characters at the start or end of the text.""" 265 txt = self.get(input) 266 267 punctuationToRemove = r"""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~¡¿“”‘’«»""" 268 txt = re.sub(rf"^[\s{punctuationToRemove}]+", "", txt) # Leading punctuation 269 txt = re.sub(rf"[\s{punctuationToRemove}]+$", "", txt) # Trailing punctuation 270 271 self.text = txt.strip() 272 return self.text if input is not None else self 273 274 def removeOrphanedPunctuation( 275 self, input: str = None 276 ) -> Union[str, "KTextCleaner"]: 277 """Remove punctuation characters without relevant counterpart.""" 278 txt = self.get(input) 279 280 # Stack-based approach for proper matching 281 pairs = {"(": ")", "[": "]", "{": "}", "“": "”", "‘": "’", "<": ">", "«": "»"} 282 stack = [] 283 to_remove = set() 284 285 # Find positions of unmatched brackets 286 for i, char in enumerate(txt): 287 if char in pairs: 288 stack.append((i, char)) 289 elif char in pairs.values(): 290 if stack and pairs[stack[-1][1]] == char: 291 stack.pop() 292 else: 293 to_remove.add(i) # Unmatched closing bracket 294 295 # Remaining in stack are unmatched opening brackets 296 to_remove.update(pos for pos, _ in stack) 297 298 # Build result without unmatched brackets 299 txt = "".join(char for i, char in enumerate(txt) if i not in to_remove) 300 301 txt = txt.lstrip( 302 ",.:;!?" 
303 ) # Also remove common leading punctuation if it becomes leading after cleanup 304 self.text = txt.strip() 305 return self.text if input is not None else self 306 307 def cleanWhitespace(self, input: str = None) -> Union[str, "KTextCleaner"]: 308 """Clean up whitespace around punctuation and condense multiple spaces.""" 309 txt = self.get(input) 310 311 replacements = [ 312 # Replace multiple spaces with a single space 313 (r"[ \t]{2,}", " "), 314 # Replace multiple line-breaks with a single line-break 315 (r"\n{3,}", "\n\n"), 316 # Remove space before punctuation 317 (r"\s+([.,;:!?])", r"\1"), 318 # Remove space after opening punctuation 319 (r"([\(\[\{¡¿])\s+", r"\1"), 320 # Remove space before closing punctuation 321 (r"\s+([\)\]\}])", r"\1"), 322 # Ensure space after sentence-ending punctuation if followed by capital letter 323 (r"([a-z]+)([.?!])([A-Z][a-z]+\b)", r"\1\2 \3"), 324 ] 325 for pattern, replacement in replacements: 326 txt = re.sub(pattern, replacement, txt) 327 328 self.text = txt.strip() 329 return self.text if input is not None else self 330 331 def cleanExtra(self, input: str = None) -> Union[str, "KTextCleaner"]: 332 # Remove unwanted characters (example: non-ASCII) 333 txt = self.get(input) 334 txt = re.sub(r"[^\x00-\x7F]+", "", txt) 335 336 self.text = txt.strip() 337 return self.text if input is not None else self 338 339 def sanitizeForbidden( 340 self, 341 input: str | list[str] = None, 342 dropStrategy: Literal["word", "sentence"] = "word", 343 extendTerms: list[str] = None, 344 ) -> Union[str, list[str], "KTextCleaner"]: 345 """ 346 Sanitize text using the forbidden-terms list. 347 348 Args: 349 input: Input text or list of strings to sanitize. 350 dropStrategy: `word` to remove only incriminating words, 351 `sentence` to drop any sentence containing an incriminating word. 352 extendTerms: Optional list of additional forbidden terms. 
353 """ 354 txt = self.get(input) 355 terms = self._getForbiddenTerms(extendTerms) 356 357 if dropStrategy not in ["word", "sentence"]: 358 raise ValueError( 359 "dropStrategy must be either 'word' or 'sentence', got " 360 + f"{dropStrategy}" 361 ) 362 363 def _containsForbidden(item: str) -> bool: 364 itemLower = item.lower() 365 return any(term in itemLower for term in terms) 366 367 def _normalizeWhitespace(value: str) -> str: 368 replacements = [ 369 (r"[ \t]{2,}", " "), 370 (r"\n{3,}", "\n\n"), 371 (r"\s+([.,;:!?])", r"\1"), 372 (r"([\(\[\{¡¿])\s+", r"\1"), 373 (r"\s+([\)\]\}])", r"\1"), 374 (r"([a-z]+)([.?!])([A-Z][a-z]+\b)", r"\1\2 \3"), 375 ] 376 for pattern, replacement in replacements: 377 value = re.sub(pattern, replacement, value) 378 return value.strip() 379 380 def _sanitizeString(value: str) -> str: 381 sentences = content.splitStringToSentences(value) 382 383 if dropStrategy == "sentence": 384 sentences = [s for s in sentences if not _containsForbidden(s)] 385 else: 386 cleanedSentences = [] 387 for sentence in sentences: 388 words = sentence.split(" ") 389 words = [ 390 word for word in words if word and not _containsForbidden(word) 391 ] 392 if words: 393 cleanedSentences.append(" ".join(words)) 394 sentences = cleanedSentences 395 396 cleanValue = " ".join(sentences) 397 return _normalizeWhitespace(cleanValue) 398 399 if isinstance(txt, list): 400 if dropStrategy == "sentence": 401 output = [ 402 _normalizeWhitespace(item) 403 for item in txt 404 if not _containsForbidden(item) and _normalizeWhitespace(item) 405 ] 406 else: 407 output = [] 408 for item in txt: 409 cleanItem = _sanitizeString(item) 410 if cleanItem: 411 output.append(cleanItem) 412 self.text = output 413 else: 414 output = _sanitizeString(txt) 415 self.text = output.strip() 416 417 return self.text if input is not None else self
KTextCleaner(text: str | list[str] = None)
def __init__(self, text: str | list[str] | None = None):
    """Store the optional text to be cleaned.

    Args:
        text: Initial text (a string or a list of strings), or None when no
            text is supplied at construction time.
    """
    self.text = text
Initialize the cleaner and optionally set the text to be cleaned. The constructor itself returns nothing.
FORBIDDEN_TERMS_PATH =
'/Users/christianjansky/Library/CloudStorage/Dropbox/KOMETA-Draw/01 Content/forbidden-any.txt'
def
get(self, newText: str | list[str] = None) -> str | list[str]:
33 def get(self, newText: str | list[str] = None) -> str | list[str]: 34 """Return the current text or set a new text.""" 35 if newText is not None: 36 self.text = newText 37 if self.text is None: 38 raise ValueError("No text available for cleaning.") 39 return self.text
Return the current text, first replacing it with newText when one is given. Raises ValueError if no text is available.
def cleanWikipedia(self, input: str = None) -> Union[str, "KTextCleaner"]:
    """Strip Wikipedia-specific markup from the text.

    Stage one (line-wise) separates citations from a following lowercase
    letter and turns ``== Header ==`` lines into ``Header.``. Stage two
    deletes citation, note, editorial and IPA markup and then tidies the
    whitespace those deletions leave behind.

    Returns the cleaned string when ``input`` is given (possibly ``""``),
    otherwise ``self`` for chaining.
    """
    cleaned = self.get(input)

    # Stage one: needs MULTILINE so that ^ and $ anchor on every line.
    lineWiseRules = (
        # Add space when citation is directly followed by lowercase letter
        (r"(\[\d+(?::?\d+(?:[-–]\d+)?)?\])([a-z])", r"\1 \2"),
        (r"(\[(?:[Nn]ote|[Nn]?[Bb]|n)\s+\d+\])([a-z])", r"\1 \2"),
        (r"(\[[a-z]\])([a-z])", r"\1 \2"),
        # Replace section headers like == Header == with just "Header."
        (r"^={2,}\s*(.*?)\s*={2,}$", r"\1."),
    )
    for regex, substitute in lineWiseRules:
        cleaned = re.sub(regex, substitute, cleaned, flags=re.MULTILINE)

    # Stage two: deletions first (empty substitute), whitespace tidy-up last.
    plainRules = (
        # Citation references preceded by non-whitespace: [1][2], [3]:15
        (r"(?<=\S)\[\d+\](?::?\d+(?:[-–]\d+)?)?", ""),
        # Note references: [note 1], [n 1], [nb 1], [NB 1], [Note 1]
        (r"\[(?:[Nn]ote|[Nn]?[Bb]|n)\s+\d+\]", ""),
        # Single letter notes: [a], [b], etc.
        (r"(?<=\S)\[[a-z]\]", ""),
        # Editorial tags
        (r"(\{{2}|\[)[Cc]itation( needed)?(\}{2}|\])", ""),
        (r"(\{{2}|\[)[Cc]larification( needed)?(\}{2}|\])", ""),
        (r"(\{{2}|\[)[Vv]erification( needed)?(\}{2}|\])", ""),
        (r"(\{{2}|\[)[Cc]ite [Bb]ook(\}{2}|\])", ""),
        # IPA pronunciations only (regular word/word slashes are kept); also
        # consumes optional surrounding punctuation spacing (e.g. "; /ræm/").
        (
            r"(?:\s*[;,]\s*)?/(?=[^/\n]{1,80}/)(?=[^/\n]*[ˈˌːɪʊəɛæɑɔʃʒθðŋɡɹɾʔ])[^/\n]{1,80}/(?:\s*[,;])?",
            "",
        ),
        # No whitespace right after opening punctuation, e.g. "( born"
        (r"([\(\[\{])\s+", r"\1"),
        # No whitespace before punctuation left behind by a deletion
        (r"\s+([,.;:!?])", r"\1"),
        # Collapse repeated horizontal spaces left behind by deletions
        (r"[ \t]{2,}", " "),
    )
    for regex, substitute in plainRules:
        cleaned = re.sub(regex, substitute, cleaned)

    self.text = cleaned.strip()
    # Compare against None explicitly: input == '' is falsy but must still
    # return the cleaned text rather than self.
    if input is None:
        return self
    return self.text
Clean Wikipedia-specific markup from text.
def removeLaTeX(self, input: str = None) -> Union[str, "KTextCleaner"]:
    """Strip LaTeX math markup and its leftovers, keeping the prose.

    Three removal stages run in order: ``{\\displaystyle ...}`` /
    ``{\\textstyle ...}`` wrappers, backslash commands and command runs,
    then brace groups that look like formulas. A final series of
    substitutions clears empty brackets, stray braces and whitespace
    artifacts.

    Args:
        input: Text to clean. When omitted, the text stored on the instance
            is used.

    Returns:
        The cleaned string when ``input`` was passed, otherwise ``self``
        for chaining. The result is stored in ``self.text`` either way.
    """

    def _stripStyleWrappers(value: str) -> str:
        # Remove whole {\displaystyle ...} / {\textstyle ...} groups,
        # tracking brace depth so nested groups go with their wrapper.
        wrapperStart = re.compile(r"\{\\(?:display|text)style\b")

        while True:
            hit = wrapperStart.search(value)
            if hit is None:
                return value

            begin = hit.start()
            closeAt = None
            depth = 1
            for idx in range(hit.end(), len(value)):
                symbol = value[idx]
                if symbol == "{":
                    depth += 1
                elif symbol == "}":
                    depth -= 1
                    if depth == 0:
                        closeAt = idx + 1
                        break

            if closeAt is not None:
                value = value[:begin] + " " + value[closeAt:]
                continue

            # Wrapper never closes (truncated/malformed): cut up to the
            # nearest sentence break, or drop the rest if there is none.
            boundary = re.search(r"[.!?\n]", value[begin:])
            if boundary is None:
                return value[:begin]
            value = value[:begin] + " " + value[begin + boundary.start():]

    def _stripCommands(value: str) -> str:
        # Commands with a braced argument; repeat until none remain so
        # nested arguments are peeled from the inside out.
        withArgument = re.compile(r"\\[A-Za-z]+\*?\s*\{[^{}]*\}")
        replaced = 1
        while replaced:
            value, replaced = withArgument.subn(" ", value)

        # A known math command or a sub/superscript, together with the
        # operators, digits and further commands trailing it.
        formulaRun = re.compile(
            r"(?:\\(?:frac|sqrt|text|mathrm|mathbf|hat|vec|cdot|times|int|sum|prod|partial|nabla|left|right|lVert|rVert|alpha|beta|gamma|delta|epsilon|theta|lambda|mu|nu|pi|rho|sigma|tau|phi|psi|omega|Delta|Omega|to|propto|leq|geq|neq|approx|pm|mp|infty|sin|cos|tan|log|ln)\b"
            r"|[_^]\{[^{}]*\}"
            r"|[_^][A-Za-z0-9])"
            r"(?:[\s{}()\[\],.=:+\-*/<>|]|\\[A-Za-z]+|\d)*"
        )
        value = formulaRun.sub(" ", value)

        # Whatever backslash commands are still left.
        return re.sub(r"\\[A-Za-z]+\*?", " ", value)

    def _stripFormulaBraces(value: str) -> str:
        # Innermost brace groups containing formula-like content
        # (_ ^ = a backslash, or a digit followed by an operator);
        # repeated until nothing more matches.
        formulaGroup = re.compile(r"\{[^{}]*(?:[_^=]|\\|\d\s*[+\-*/=])[^{}]*\}")
        replaced = 1
        while replaced:
            value, replaced = formulaGroup.subn(" ", value)
        return value

    cleaned = self.get(input)
    for stage in (_stripStyleWrappers, _stripCommands, _stripFormulaBraces):
        cleaned = stage(cleaned)

    # Punctuation and whitespace remnants left by the stripped math chunks.
    remnantRules = (
        # Brackets holding nothing but separators/dashes
        (r"\(\s*[;,:\-–—]*\s*\)", ""),
        (r"\[\s*[;,:\-–—]*\s*\]", ""),
        (r"\{\s*[;,:\-–—]*\s*\}", ""),
        # Any remaining braces
        (r"[{}]", " "),
        # Whitespace before punctuation, then repeated punctuation
        (r"\s+([.,;:!?])", r"\1"),
        (r"([.,;:!?]){2,}", r"\1"),
        # Spaces/tabs around line breaks, runs of spaces, excess blank lines
        (r"[ \t]*\n[ \t]*", "\n"),
        (r"[ \t]{2,}", " "),
        (r"\n{3,}", "\n\n"),
    )
    for regex, substitute in remnantRules:
        cleaned = re.sub(regex, substitute, cleaned)

    self.text = cleaned.strip()
    if input is None:
        return self
    return self.text
Remove LaTeX math markup and remnants while preserving prose.
def cleanPunctuation(self, input: str = None) -> Union[str, "KTextCleaner"]:
    """Drop empty bracket pairs and collapse runs of punctuation marks.

    Args:
        input: Text to clean. When omitted, the text stored on the instance
            is used.

    Returns:
        The cleaned string when ``input`` was passed, otherwise ``self``
        for chaining. The result is stored in ``self.text`` either way.
    """
    result = self.get(input)

    # An opening (, [ or { followed by optional whitespace and any closing
    # ), ] or } is removed; the two bracket kinds need not match.
    result = re.sub(r"[\(\[\{]\s*[\)\]\}]", "", result)

    # Two or more consecutive marks collapse to one. The group repeats, so
    # \1 holds the LAST mark of the run ("?!" becomes "!").
    result = re.sub(r"([.,;:!?]){2,}", r"\1", result)

    self.text = result.strip()
    if input is None:
        return self
    return self.text
Remove empty parentheses, brackets, and braces; condense multiple punctuation marks.
def improvePunctuation(self, input: str = None) -> Union[str, "KTextCleaner"]:
    """Upgrade plain ASCII punctuation to its typographic form.

    Straight apostrophes and single quotes become curly, year ranges get
    an en dash, double hyphens and spaced hyphens become em dashes, three
    dots become an ellipsis character, repeated marks are deduplicated,
    and a separator left just before a closing bracket is dropped. Double
    quotes are then handled by ``self._smartDoubleQuotes``.

    Args:
        input: Text to improve. When omitted, the text stored on the
            instance is used.

    Returns:
        The improved string when ``input`` was passed, otherwise ``self``
        for chaining. The result is stored in ``self.text`` either way.
    """
    improved = self.get(input)

    # Order matters: later rules rely on apostrophes already being curly.
    orderedRules = (
        # Apostrophe between two letters (Wet'n'Wild, O'Neill)
        (r"(?<=[A-Za-z])'(?=[A-Za-z])", "’"),
        # Apostrophe in contractions and possessives
        (r"\b([A-Za-z]+)'([A-Za-z]*)\b", r"\1’\2"),
        # Quoted span of at least two characters; the opening quote must not
        # follow a word character and the closing one must not precede a
        # digit, which excludes measurements like 5'
        (r"(?<!\w)(?<!\d)'([^']{2,}?)'(?!\d)", r"‘\1’"),
        # Trailing possessive apostrophe (Chris')
        (r"([A-Za-z])'(\s)\b", r"\1’\2"),
        # Hyphen between two four-digit numbers becomes an en dash
        (r"(\d{4})\s*-\s*(\d{4})", r"\1–\2"),
        # Abbreviated year not preceded by a digit: '95, '60s
        (r"(?<!\d)'(\d{2})(s?)\b", r"’\1\2"),
        # Leading elision: 'tis -> ’tis
        (r"(?<!\w)'([A-Za-z]+)\b", r"’\1"),
        # Two or more hyphens become one em dash
        (r"--+", "—"),
        # A lone hyphen with whitespace on both sides becomes an em dash
        (r"(\w\s+)-(\s+\w)", r"\1—\2"),
        # Three dots after a word become " …"
        (r"(\w)\s*\.{3}", r"\1 …"),
        # Repeated marks collapse to one ("!!" -> "!", "???" -> "?")
        (r"([.,;:!?–—]){2,}", r"\1"),
        # Separator right before a closing bracket: (RAM;) -> (RAM)
        (r"([A-Za-z0-9])\s*[;,:]\s*([\)\]\}])", r"\1\2"),
    )
    for regex, substitute in orderedRules:
        improved = re.sub(regex, substitute, improved)

    improved = self._smartDoubleQuotes(improved)

    self.text = improved.strip()
    if input is None:
        return self
    return self.text
Convert dumb quotes to smart quotes.
def removeOutsidePunctuation(self, input: str = None) -> Union[str, "KTextCleaner"]:
    """Trim punctuation and whitespace from both ends of the text.

    Only the leading and trailing runs are affected; punctuation inside the
    text is left alone.

    Args:
        input: Text to trim. When omitted, the text stored on the instance
            is used.

    Returns:
        The trimmed string when ``input`` was passed, otherwise ``self``
        for chaining. The result is stored in ``self.text`` either way.
    """
    trimmed = self.get(input)

    # ASCII punctuation plus inverted marks and curly/angle quotes. The
    # string is dropped into a character class below, so it is written to
    # be valid there ("\]" keeps the class open).
    edgeChars = r"""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~¡¿“”‘’«»"""

    # Leading run first, then trailing run.
    for edgePattern in (rf"^[\s{edgeChars}]+", rf"[\s{edgeChars}]+$"):
        trimmed = re.sub(edgePattern, "", trimmed)

    self.text = trimmed.strip()
    if input is None:
        return self
    return self.text
Remove punctuation characters at the start or end of the text.
274 def removeOrphanedPunctuation( 275 self, input: str = None 276 ) -> Union[str, "KTextCleaner"]: 277 """Remove punctuation characters without relevant counterpart.""" 278 txt = self.get(input) 279 280 # Stack-based approach for proper matching 281 pairs = {"(": ")", "[": "]", "{": "}", "“": "”", "‘": "’", "<": ">", "«": "»"} 282 stack = [] 283 to_remove = set() 284 285 # Find positions of unmatched brackets 286 for i, char in enumerate(txt): 287 if char in pairs: 288 stack.append((i, char)) 289 elif char in pairs.values(): 290 if stack and pairs[stack[-1][1]] == char: 291 stack.pop() 292 else: 293 to_remove.add(i) # Unmatched closing bracket 294 295 # Remaining in stack are unmatched opening brackets 296 to_remove.update(pos for pos, _ in stack) 297 298 # Build result without unmatched brackets 299 txt = "".join(char for i, char in enumerate(txt) if i not in to_remove) 300 301 txt = txt.lstrip( 302 ",.:;!?" 303 ) # Also remove common leading punctuation if it becomes leading after cleanup 304 self.text = txt.strip() 305 return self.text if input is not None else self
Remove punctuation characters without relevant counterpart.
def cleanWhitespace(self, input: str = None) -> Union[str, "KTextCleaner"]:
    """Normalize spacing around punctuation and collapse repeated whitespace.

    Args:
        input: Text to clean. When omitted, the text stored on the instance
            is used.

    Returns:
        The cleaned string when ``input`` was passed, otherwise ``self``
        for chaining. The result is stored in ``self.text`` either way.
    """
    spaced = self.get(input)

    spacingRules = (
        # Runs of spaces/tabs become one space
        (r"[ \t]{2,}", " "),
        # Three or more line breaks become a single blank line
        (r"\n{3,}", "\n\n"),
        # No whitespace before , . ; : ! ?
        (r"\s+([.,;:!?])", r"\1"),
        # No whitespace after an opening bracket or inverted mark
        (r"([\(\[\{¡¿])\s+", r"\1"),
        # No whitespace before a closing bracket
        (r"\s+([\)\]\}])", r"\1"),
        # Insert the missing space in "word.Next" style sentence joins
        (r"([a-z]+)([.?!])([A-Z][a-z]+\b)", r"\1\2 \3"),
    )
    for regex, substitute in spacingRules:
        spaced = re.sub(regex, substitute, spaced)

    self.text = spaced.strip()
    if input is None:
        return self
    return self.text
Clean up whitespace around punctuation and condense multiple spaces.
def
sanitizeForbidden( self, input: str | list[str] = None, dropStrategy: Literal['word', 'sentence'] = 'word', extendTerms: list[str] = None) -> Union[str, list[str], KTextCleaner]:
339 def sanitizeForbidden( 340 self, 341 input: str | list[str] = None, 342 dropStrategy: Literal["word", "sentence"] = "word", 343 extendTerms: list[str] = None, 344 ) -> Union[str, list[str], "KTextCleaner"]: 345 """ 346 Sanitize text using the forbidden-terms list. 347 348 Args: 349 input: Input text or list of strings to sanitize. 350 dropStrategy: `word` to remove only incriminating words, 351 `sentence` to drop any sentence containing an incriminating word. 352 extendTerms: Optional list of additional forbidden terms. 353 """ 354 txt = self.get(input) 355 terms = self._getForbiddenTerms(extendTerms) 356 357 if dropStrategy not in ["word", "sentence"]: 358 raise ValueError( 359 "dropStrategy must be either 'word' or 'sentence', got " 360 + f"{dropStrategy}" 361 ) 362 363 def _containsForbidden(item: str) -> bool: 364 itemLower = item.lower() 365 return any(term in itemLower for term in terms) 366 367 def _normalizeWhitespace(value: str) -> str: 368 replacements = [ 369 (r"[ \t]{2,}", " "), 370 (r"\n{3,}", "\n\n"), 371 (r"\s+([.,;:!?])", r"\1"), 372 (r"([\(\[\{¡¿])\s+", r"\1"), 373 (r"\s+([\)\]\}])", r"\1"), 374 (r"([a-z]+)([.?!])([A-Z][a-z]+\b)", r"\1\2 \3"), 375 ] 376 for pattern, replacement in replacements: 377 value = re.sub(pattern, replacement, value) 378 return value.strip() 379 380 def _sanitizeString(value: str) -> str: 381 sentences = content.splitStringToSentences(value) 382 383 if dropStrategy == "sentence": 384 sentences = [s for s in sentences if not _containsForbidden(s)] 385 else: 386 cleanedSentences = [] 387 for sentence in sentences: 388 words = sentence.split(" ") 389 words = [ 390 word for word in words if word and not _containsForbidden(word) 391 ] 392 if words: 393 cleanedSentences.append(" ".join(words)) 394 sentences = cleanedSentences 395 396 cleanValue = " ".join(sentences) 397 return _normalizeWhitespace(cleanValue) 398 399 if isinstance(txt, list): 400 if dropStrategy == "sentence": 401 output = [ 402 _normalizeWhitespace(item) 403 
for item in txt 404 if not _containsForbidden(item) and _normalizeWhitespace(item) 405 ] 406 else: 407 output = [] 408 for item in txt: 409 cleanItem = _sanitizeString(item) 410 if cleanItem: 411 output.append(cleanItem) 412 self.text = output 413 else: 414 output = _sanitizeString(txt) 415 self.text = output.strip() 416 417 return self.text if input is not None else self
Sanitize text using the forbidden-terms list.
Arguments:
- input: Input text or list of strings to sanitize.
- dropStrategy: `word` to remove only incriminating words, `sentence` to drop any sentence containing an incriminating word.
- extendTerms: Optional list of additional forbidden terms.