classes.c30_scraper
from typing import Literal
import wikipedia
import re
import json
import hashlib
import os
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from loguru import logger

from lib import files, helpers, content
from .c34_text_cleaner import KTextCleaner

CacheFlushMode = Literal["fetch", "articles", "all"]


class KScraper:
    """
    Fetches Wikipedia articles, cleans them, and caches the results on disk.

    Two cache layers live in the instance folder: one `wiki-*.txt` file per
    article and one `.fetch-cache-*.json` file per fetchArticles request.
    """

    ScraperMode = Literal["quick", "balanced", "comprehensive"]
    metadata: dict
    """Metadata about the last fetchArticles call."""

    @staticmethod
    def _emptyMetadata() -> dict:
        """Return the metadata dict describing a run that produced no articles."""
        return {
            "slugs": [],
            "actualWords": 0,
            "articleCount": 0,
            "truncated": False,
            "requestedArticles": 0,
        }

    @staticmethod
    def estimateOptimalSize(article_count: int, mode: ScraperMode = "quick") -> dict:
        """
        Recommends optimal text size parameters based on article count and use case.

        Args:
            article_count: Number of articles to fetch.
            mode: Processing mode indicating desired size and depth of content.
                Unknown modes fall back to the "quick" preset.

        Returns:
            Dict with recommended 'targetWords', 'maxWords', and 'minWordsPerArticle'.
        """
        modes = {
            "quick": {"base": 10000, "max": 15000, "min_per": 250},
            "balanced": {"base": 20000, "max": 30000, "min_per": 300},
            "comprehensive": {"base": 40000, "max": 50000, "min_per": 500},
        }

        config = modes.get(mode, modes["quick"])

        # Adjust target based on article count
        # More articles = higher target (but capped at max)
        target = min(config["base"] * (1 + article_count // 5), config["max"])

        return {
            "targetWords": target,
            "maxWords": config["max"],
            "minWordsPerArticle": config["min_per"],
        }

    @staticmethod
    def _findWikiTitle(slug: str) -> str:
        """
        Finds a valid Wikipedia page title or its closest match.

        Args:
            slug: The Wikipedia page slug.

        Returns:
            The resolved Wikipedia page title.

        Raises:
            wikipedia.exceptions.PageError: If the page does not exist and
                Wikipedia offers no spelling suggestion.
            ValueError: If the response stays empty/invalid after all retries
                (json.JSONDecodeError is a ValueError subclass).
        """
        max_retries = 4
        for attempt in range(max_retries):
            try:
                try:
                    # Find exact match
                    page = wikipedia.page(slug, auto_suggest=False)
                except wikipedia.exceptions.PageError as e:
                    # Correct spelling error
                    suggested = wikipedia.suggest(e.pageid)
                    if not suggested:
                        # Without a suggestion, page(None) would raise ValueError,
                        # which the handler below would treat as a transient
                        # response and retry with backoff. Fail fast instead.
                        raise
                    logger.warning("Correcting {} to {}", e.pageid, suggested)
                    page = wikipedia.page(suggested, auto_suggest=False)
                except wikipedia.exceptions.DisambiguationError as e:
                    # Pick first possible match if it may refer to multiple pages
                    logger.warning("Possible pages for {}: {}", slug, e.options)
                    page = wikipedia.page(helpers.pickFirst(e.options), auto_suggest=False)
            except (json.JSONDecodeError, ValueError) as e:
                if attempt < max_retries - 1:
                    # Exponential backoff: 1s, 2s, 4s
                    wait = 2**attempt
                    logger.warning(
                        "Empty/invalid response resolving title for {} (attempt {}/{}), retrying in {}s: {}",
                        slug,
                        attempt + 1,
                        max_retries,
                        wait,
                        e,
                    )
                    time.sleep(wait)
                    continue
                raise
            break

        return page.title

    @staticmethod
    def _splitSentences(text: str) -> list[str]:
        """
        Split text into sentences (simple implementation).

        Args:
            text: Input text to split.

        Returns:
            List of non-empty, stripped sentences.
        """
        # Split on whitespace that follows a sentence terminator
        sentences = re.split(r"(?<=[.!?])\s+", text)
        return [s.strip() for s in sentences if s.strip()]

    def __init__(
        self,
        folder="01 Content/articles",
        language="en",
        mode: ScraperMode = "quick",
        maxWorkers: int = 4,
        root: str = "/Users/christianjansky/Library/CloudStorage/Dropbox/KOMETA-Draw",
    ) -> None:
        """
        Initializes the KScraper with a target folder, language, and processing mode.

        Args:
            folder: Folder partial where articles will be saved.
            language: The language for Wikipedia articles.
            mode: Processing mode
                - `quick` (10-15k words)
                - `balanced` (20-30k words)
                - `comprehensive` (40-50k words)
            maxWorkers: Number of concurrent fetch workers (clamped to at least 1).
            root: Absolute base directory that `folder` is resolved against.
        """
        # Prepend absolute path in case specimen not run from 03 DrawBot/
        self.folder = files.createFolder(
            f"{root.rstrip('/')}/{folder.rstrip('/')}-{language}/"
        )
        """Full path to the file system folder where articles will be saved."""
        self.mode = mode
        """Processing mode for size estimation."""
        self.language = language
        """Language code used for Wikipedia queries and cache keys."""
        self.maxWorkers = max(1, int(maxWorkers))
        """Maximum number of concurrent workers used by fetchArticles."""
        self.articles = {}
        """Dict storing fetched articles as {slug: content}."""
        self._titleCache = {}
        """Per-instance cache for resolved page titles by requested slug."""
        # Initialise so reading metadata before the first fetch does not raise
        self.metadata = self._emptyMetadata()
        wikipedia.set_lang(language)

    def _fetchCachePath(
        self,
        slugs: list[str],
        targetWords: int | None,
        maxWords: int | None,
        minWordsPerArticle: int | None,
    ) -> str:
        """
        Return deterministic path for a fetch result cache file.

        The file name is an MD5 digest of the language, mode, sorted slugs and
        size parameters, so slug order does not affect the path.
        """
        payload = {
            "language": self.language,
            "mode": self.mode,
            "slugs": sorted(slugs),
            "targetWords": targetWords,
            "maxWords": maxWords,
            "minWordsPerArticle": minWordsPerArticle,
        }
        digest = hashlib.md5(
            json.dumps(payload, sort_keys=True, ensure_ascii=True).encode("utf-8")
        ).hexdigest()
        return f"{self.folder}.fetch-cache-{digest}.json"

    def flushCache(self, mode: CacheFlushMode = "all") -> dict[str, int | str]:
        """
        Flush scraper cache files from disk and clear in-memory cache state.

        Args:
            mode: Which cache files to remove:
                - "fetch": Remove `.fetch-cache-*.json` files.
                - "articles": Remove `wiki-*.txt` files.
                - "all": Remove both cache types.

        Returns:
            Dict with deletion counters and selected mode.
        """
        deleted_fetch = 0
        deleted_articles = 0

        # Best effort: an unreadable folder is logged and treated as empty
        try:
            folder_entries = os.listdir(self.folder)
        except Exception as e:
            logger.warning("Unable to list cache folder {}: {}", self.folder, e)
            folder_entries = []

        for entry in folder_entries:
            if (
                mode in ("fetch", "all")
                and entry.startswith(".fetch-cache-")
                and entry.endswith(".json")
            ):
                file_path = f"{self.folder}{entry}"
                try:
                    os.remove(file_path)
                    deleted_fetch += 1
                except Exception as e:
                    logger.warning("Unable to delete fetch cache {}: {}", file_path, e)

            if (
                mode in ("articles", "all")
                and entry.startswith("wiki-")
                and entry.endswith(".txt")
            ):
                file_path = f"{self.folder}{entry}"
                try:
                    os.remove(file_path)
                    deleted_articles += 1
                except Exception as e:
                    logger.warning(
                        "Unable to delete article cache {}: {}", file_path, e
                    )

        # In-memory state is cleared regardless of the selected mode
        self.articles = {}
        self._titleCache = {}
        self.metadata = self._emptyMetadata()

        result = {
            "mode": mode,
            "deletedFetchCaches": deleted_fetch,
            "deletedArticleCaches": deleted_articles,
        }
        logger.info("Flushed cache: {}", result)
        return result

    def fetchArticle(self, slug: str) -> str:
        """
        Fetches a Wikipedia article by slug, saves it, and returns parsed content.

        The slug is resolved to a page title (memoised per instance). If a
        `wiki-*.txt` file for that title exists it is read; otherwise the page
        is downloaded, cleaned and written to that file.

        Args:
            slug: The Wikipedia page slug.

        Returns:
            The parsed article content.
        """

        def locateFile(pageSlug: str) -> str:
            """Return the on-disk cache path for a page title."""
            return f"{self.folder}wiki-{content.toKebabCase(pageSlug)}.txt"

        def _saveArticle(slug: str) -> str:
            """
            Saves the Wikipedia article as a .txt file and returns its content.

            Args:
                slug: The Wikipedia page slug.

            Returns:
                The saved article content.
            """
            logger.debug("Saving article for {}", slug)

            max_retries = 4
            for attempt in range(max_retries):
                try:
                    page = wikipedia.page(slug, auto_suggest=False)
                    break
                except (json.JSONDecodeError, ValueError) as e:
                    if attempt < max_retries - 1:
                        # Exponential backoff: 1s, 2s, 4s
                        wait = 2**attempt
                        logger.warning(
                            "Empty/invalid response fetching {} (attempt {}/{}), retrying in {}s: {}",
                            slug,
                            attempt + 1,
                            max_retries,
                            wait,
                            e,
                        )
                        time.sleep(wait)
                    else:
                        raise

            # Build the full output before opening the file. A failure while
            # fetching or cleaning must not leave an empty cache file behind,
            # because it would later be read back as a valid cached article.
            pageClean = (
                KTextCleaner(page.content)
                .cleanWikipedia()
                .sanitizeForbidden()
                .removeLaTeX()
                .cleanPunctuation()
                .improvePunctuation()
                .cleanWhitespace()
                .get()
            )
            output = f"{page.url}\n\n{pageClean}"
            # Explicit UTF-8 matches the fetch cache writer in fetchArticles
            # NOTE(review): assumes files.readFile decodes UTF-8 — confirm
            with open(locateFile(slug), "w", encoding="utf-8") as f:
                f.write(output)
            return output

        def _parseArticle(raw: str) -> str:
            """
            Parses the saved article content, removing URL and formatting.

            Args:
                raw: The raw article content.

            Returns:
                The cleaned article content.
            """
            # Strip article URL from first line
            output = re.sub(r"^http.+\n+", "", raw)
            # Strip == Headline == formatting
            output = re.sub(r"(?<=\n)=+\s+|\s+=+(?=\n)", "", output)

            return output

        if slug in self._titleCache:
            pageTitle = self._titleCache[slug]
        else:
            pageTitle = self._findWikiTitle(slug)
            self._titleCache[slug] = pageTitle
        filePath = locateFile(pageTitle)

        if files.isFile(filePath):
            result = files.readFile(filePath)
        else:
            try:
                result = _saveArticle(pageTitle)
            except Exception as e:
                logger.warning("Unable to save {}: {}", pageTitle, e)
                raise

        return _parseArticle(result)

    def fetchArticles(
        self,
        slugs: list[str],
        targetWords: int | None = None,
        maxWords: int | None = None,
        minWordsPerArticle: int | None = None,
    ) -> "KScraper":
        """
        Fetches and parses multiple Wikipedia articles with smart size management.
        Stores articles in self.articles dict as {slug: content}, replacing
        whatever a previous call stored.
        Uses instance mode to automatically determine size parameters if not provided.

        Args:
            slugs: A list of Wikipedia page slugs. Duplicates are processed once.
            targetWords: Target word count (distributes quota across articles). If None, uses mode-based estimate.
            maxWords: Hard maximum word count (stops fetching when reached). If None, uses mode-based estimate.
            minWordsPerArticle: Skip articles shorter than this. If None, uses mode-based estimate.

        Returns self for chaining.
        """
        # Apply mode-based defaults if parameters not provided
        if targetWords is None or maxWords is None or minWordsPerArticle is None:
            params = self.estimateOptimalSize(len(slugs), self.mode)
            targetWords = targetWords or params["targetWords"]
            maxWords = maxWords or params["maxWords"]
            minWordsPerArticle = (
                minWordsPerArticle
                if minWordsPerArticle is not None
                else params["minWordsPerArticle"]
            )

        if not slugs:
            self.articles = {}
            self.metadata = self._emptyMetadata()
            return self

        # Order-preserving de-duplication
        uniqueSlugs = list(dict.fromkeys(slugs))

        cachePath = self._fetchCachePath(
            slugs, targetWords, maxWords, minWordsPerArticle
        )
        if files.isFile(cachePath):
            try:
                payload = files.readFile(cachePath, mode="json")
                cachedArticles = payload.get("articles", {})
                self.articles = {
                    slug: cachedArticles[slug]
                    for slug in slugs
                    if slug in cachedArticles
                }
                self.metadata = payload.get("metadata", {})
                logger.trace("Loaded fetch cache: {}", cachePath)
                return self
            except Exception as e:
                # A corrupt cache is not fatal: fall through to a fresh fetch
                logger.warning("Invalid fetch cache {}: {}", cachePath, e)

        fetched_slugs = []
        total_words = 0
        truncated = False
        articles_fetched = 0

        # Calculate per-article quota if target is specified
        quota_per_article = targetWords // len(uniqueSlugs) if targetWords else None

        rawResults = {}
        with ThreadPoolExecutor(max_workers=self.maxWorkers) as executor:
            futures = {
                executor.submit(self.fetchArticle, slug): slug for slug in uniqueSlugs
            }
            for future in as_completed(futures):
                slug = futures[future]
                try:
                    rawResults[slug] = future.result()
                except Exception as e:
                    # One failed article must not abort the whole batch
                    logger.warning(f"Failed to fetch {slug}: {e}")

        # Start from a clean slate so articles from an earlier call do not leak
        # into this result or into the cache file written below.
        self.articles = {}

        # Iterate de-duplicated slugs so a repeated slug is not counted twice
        for slug in uniqueSlugs:
            # Stop if we've hit the hard maximum
            if maxWords and total_words >= maxWords:
                logger.info(f"Reached maxWords limit ({maxWords}), stopping fetch")
                truncated = True
                break

            article_text = rawResults.get(slug)
            if not article_text:
                continue

            word_count = len(article_text.split())

            # Skip articles that are too short
            if word_count < minWordsPerArticle:
                logger.debug(
                    f"Skipping {slug} ({word_count} words < {minWordsPerArticle} minimum)"
                )
                continue

            # Apply quota-based truncation if target is set
            if quota_per_article and word_count > quota_per_article:
                article_text = self._truncateAtSentence(article_text, quota_per_article)
                word_count = len(article_text.split())
                truncated = True
                logger.trace(f"Truncated {slug} to ~{quota_per_article} words")

            # Apply hard max truncation
            remaining_quota = maxWords - total_words if maxWords else None
            if remaining_quota and word_count > remaining_quota:
                article_text = self._truncateAtSentence(article_text, remaining_quota)
                word_count = len(article_text.split())
                truncated = True
                logger.trace(
                    f"Truncated {slug} to fit remaining quota ({remaining_quota} words)"
                )

            # Truncation returns "" when not even the first sentence fits;
            # do not store or count an empty article.
            if not article_text:
                logger.debug(f"Skipping {slug} (no complete sentence fits the quota)")
                continue

            # Store in articles dict
            self.articles[slug] = article_text
            fetched_slugs.append(slug)
            total_words += word_count
            articles_fetched += 1

            logger.trace(f"Fetched {slug}: {word_count} words (total: {total_words})")

        self.metadata = {
            "slugs": fetched_slugs,
            "actualWords": total_words,
            "articleCount": articles_fetched,
            "truncated": truncated,
            "requestedArticles": len(slugs),
        }
        logger.trace(self.metadata)

        # Best effort: failing to persist the cache only costs a refetch later
        try:
            with open(cachePath, "w", encoding="utf-8") as f:
                json.dump(
                    {"articles": self.articles, "metadata": self.metadata},
                    f,
                    ensure_ascii=False,
                )
        except Exception as e:
            logger.warning("Unable to write fetch cache {}: {}", cachePath, e)

        return self

    def _truncateAtSentence(self, text: str, target_words: int) -> str:
        """
        Truncate text at the last sentence boundary that fits the target word count.

        Args:
            text: Text to truncate.
            target_words: Target word count.

        Returns:
            Truncated text ending at a sentence boundary; an empty string when
            the first sentence alone exceeds target_words.
        """
        sentences = self._splitSentences(text)
        result = []
        word_count = 0

        for sentence in sentences:
            sentence_words = len(sentence.split())
            if word_count + sentence_words > target_words:
                break
            result.append(sentence)
            word_count += sentence_words

        return " ".join(result)

    def compileArticles(
        self, slugs: list[str] | None = None, separator: str = "\n\n"
    ) -> str:
        """
        Compile fetched articles into a single concatenated string.

        Args:
            slugs: List of specific slugs to compile. If None, compiles all stored articles.
            separator: String to join articles with (default: "\\n\\n").

        Returns:
            Concatenated article content as a single string.

        Raises:
            ValueError: If no stored article matches the request.
        """
        if slugs is None:
            # Compile all articles
            articles_to_compile = list(self.articles.values())
        else:
            # Compile only specified slugs; unknown slugs are ignored
            articles_to_compile = [
                self.articles[slug] for slug in slugs if slug in self.articles
            ]

        if not articles_to_compile:
            raise ValueError(
                "No articles to compile. Check if slugs are correct and articles are fetched."
            )
        return separator.join(articles_to_compile)
18class KScraper: 19 ScraperMode = Literal["quick", "balanced", "comprehensive"] 20 metadata: dict 21 """Metadata about the last fetchArticles call.""" 22 23 @staticmethod 24 def estimateOptimalSize(article_count: int, mode: ScraperMode = "quick") -> dict: 25 """ 26 Recommends optimal text size parameters based on article count and use case. 27 28 Args: 29 article_count: Number of articles to fetch. 30 mode: Processing mode indicating desired size and depth of content. 31 32 Returns: 33 Dict with recommended 'targetWords', 'maxWords', and 'minWordsPerArticle'. 34 """ 35 modes = { 36 "quick": {"base": 10000, "max": 15000, "min_per": 250}, 37 "balanced": {"base": 20000, "max": 30000, "min_per": 300}, 38 "comprehensive": {"base": 40000, "max": 50000, "min_per": 500}, 39 } 40 41 config = modes.get(mode, modes["quick"]) 42 43 # Adjust target based on article count 44 # More articles = higher target (but capped at max) 45 target = min(config["base"] * (1 + article_count // 5), config["max"]) 46 47 return { 48 "targetWords": target, 49 "maxWords": config["max"], 50 "minWordsPerArticle": config["min_per"], 51 } 52 53 @staticmethod 54 def _findWikiTitle(slug: str) -> str: 55 """ 56 Finds a valid Wikipedia page title or its closest match. 57 58 Args: 59 slug: The Wikipedia page slug. 60 61 Returns: 62 The resolved Wikipedia page title. 
63 """ 64 max_retries = 4 65 for attempt in range(max_retries): 66 try: 67 try: 68 # Find exact match 69 page = wikipedia.page(slug, auto_suggest=False) 70 except wikipedia.exceptions.PageError as e: 71 # Correct spelling error 72 suggested = wikipedia.suggest(e.pageid) 73 logger.warning("Correcting {} to {}", e.pageid, suggested) 74 page = wikipedia.page(suggested, auto_suggest=False) 75 except wikipedia.exceptions.DisambiguationError as e: 76 # Pick first possible match if it may refer to multiple pages 77 logger.warning("Possible pages for {}: {}", slug, e.options) 78 page = wikipedia.page(helpers.pickFirst(e.options), auto_suggest=False) 79 except (json.JSONDecodeError, ValueError) as e: 80 if attempt < max_retries - 1: 81 wait = 2**attempt 82 logger.warning( 83 "Empty/invalid response resolving title for {} (attempt {}/{}), retrying in {}s: {}", 84 slug, 85 attempt + 1, 86 max_retries, 87 wait, 88 e, 89 ) 90 time.sleep(wait) 91 continue 92 raise 93 break 94 95 return page.title 96 97 @staticmethod 98 def _splitSentences(text: str) -> list[str]: 99 """ 100 Split text into sentences (simple implementation). 101 102 Args: 103 text: Input text to split. 104 105 Returns: 106 List of sentences. 107 """ 108 # Simple sentence splitting on common terminators 109 sentences = re.split(r"(?<=[.!?])\s+", text) 110 return [s.strip() for s in sentences if s.strip()] 111 112 def __init__( 113 self, 114 folder="01 Content/articles", 115 language="en", 116 mode: ScraperMode = "quick", 117 maxWorkers: int = 4, 118 ) -> None: 119 """ 120 Initializes the KScraper with a target folder, language, and processing mode. 121 122 Args: 123 folder: Folder partial where articles will be saved. 124 language: The language for Wikipedia articles. 125 mode: Processing mode 126 - `quick` (10-15k words) 127 - `balanced` (20-30k words) 128 - `comprehensive` (40-50k words) 129 maxWorkers: Number of concurrent fetch workers. 
130 """ 131 # Prepend absolute path in case specimen not run from 03 DrawBot/ 132 self.folder = files.createFolder( 133 f"/Users/christianjansky/Library/CloudStorage/Dropbox/KOMETA-Draw/{folder.rstrip('/')}-{language}/" 134 ) 135 """Full path to the file system folder where articles will be saved.""" 136 self.mode = mode 137 """Processing mode for size estimation.""" 138 self.language = language 139 """Language code used for Wikipedia queries and cache keys.""" 140 self.maxWorkers = max(1, int(maxWorkers)) 141 """Maximum number of concurrent workers used by fetchArticles.""" 142 self.articles = {} 143 """Dict storing fetched articles as {slug: content}.""" 144 self._titleCache = {} 145 """Per-instance cache for resolved page titles by requested slug.""" 146 wikipedia.set_lang(language) 147 148 def _fetchCachePath( 149 self, 150 slugs: list[str], 151 targetWords: int | None, 152 maxWords: int | None, 153 minWordsPerArticle: int | None, 154 ) -> str: 155 """Return deterministic path for a fetch result cache file.""" 156 payload = { 157 "language": self.language, 158 "mode": self.mode, 159 "slugs": sorted(slugs), 160 "targetWords": targetWords, 161 "maxWords": maxWords, 162 "minWordsPerArticle": minWordsPerArticle, 163 } 164 digest = hashlib.md5( 165 json.dumps(payload, sort_keys=True, ensure_ascii=True).encode("utf-8") 166 ).hexdigest() 167 return f"{self.folder}.fetch-cache-{digest}.json" 168 169 def flushCache(self, mode: CacheFlushMode = "all") -> dict[str, int | str]: 170 """ 171 Flush scraper cache files from disk and clear in-memory cache state. 172 173 Args: 174 mode: Which cache files to remove: 175 - "fetch": Remove `.fetch-cache-*.json` files. 176 - "articles": Remove `wiki-*.txt` files. 177 - "all": Remove both cache types. 178 179 Returns: 180 Dict with deletion counters and selected mode. 
181 """ 182 deleted_fetch = 0 183 deleted_articles = 0 184 185 try: 186 folder_entries = os.listdir(self.folder) 187 except Exception as e: 188 logger.warning("Unable to list cache folder {}: {}", self.folder, e) 189 folder_entries = [] 190 191 for entry in folder_entries: 192 if ( 193 mode in ("fetch", "all") 194 and entry.startswith(".fetch-cache-") 195 and entry.endswith(".json") 196 ): 197 file_path = f"{self.folder}{entry}" 198 try: 199 os.remove(file_path) 200 deleted_fetch += 1 201 except Exception as e: 202 logger.warning("Unable to delete fetch cache {}: {}", file_path, e) 203 204 if ( 205 mode in ("articles", "all") 206 and entry.startswith("wiki-") 207 and entry.endswith(".txt") 208 ): 209 file_path = f"{self.folder}{entry}" 210 try: 211 os.remove(file_path) 212 deleted_articles += 1 213 except Exception as e: 214 logger.warning( 215 "Unable to delete article cache {}: {}", file_path, e 216 ) 217 218 self.articles = {} 219 self._titleCache = {} 220 self.metadata = { 221 "slugs": [], 222 "actualWords": 0, 223 "articleCount": 0, 224 "truncated": False, 225 "requestedArticles": 0, 226 } 227 228 result = { 229 "mode": mode, 230 "deletedFetchCaches": deleted_fetch, 231 "deletedArticleCaches": deleted_articles, 232 } 233 logger.info("Flushed cache: {}", result) 234 return result 235 236 def fetchArticle(self, slug: str) -> str: 237 """ 238 Fetches a Wikipedia article by slug, saves it, and returns parsed content. 239 240 Args: 241 slug: The Wikipedia page slug. 242 243 Returns: 244 The parsed article content. 245 """ 246 247 def _saveArticle(slug: str) -> str: 248 """ 249 Saves the Wikipedia article as a .txt file and returns its content. 250 251 Args: 252 slug: The Wikipedia page slug. 253 254 Returns: 255 The saved article content. 
256 """ 257 logger.debug("Saving article for {}", slug) 258 259 max_retries = 4 260 for attempt in range(max_retries): 261 try: 262 page = wikipedia.page(slug, auto_suggest=False) 263 break 264 except (json.JSONDecodeError, ValueError) as e: 265 if attempt < max_retries - 1: 266 wait = 2**attempt 267 logger.warning( 268 "Empty/invalid response fetching {} (attempt {}/{}), retrying in {}s: {}", 269 slug, 270 attempt + 1, 271 max_retries, 272 wait, 273 e, 274 ) 275 time.sleep(wait) 276 else: 277 raise 278 279 with open(locateFile(slug), "w") as f: 280 pageClean = ( 281 KTextCleaner(page.content) 282 .cleanWikipedia() 283 .sanitizeForbidden() 284 .removeLaTeX() 285 .cleanPunctuation() 286 .improvePunctuation() 287 .cleanWhitespace() 288 .get() 289 ) 290 output = f"{page.url}\n\n{pageClean}" 291 f.write(output) 292 return output 293 294 def _parseArticle(input: str) -> str: 295 """ 296 Parses the saved article content, removing URL and formatting. 297 298 Args: 299 input: The raw article content. 300 301 Returns: 302 The cleaned article content. 
303 """ 304 # Strip article URL from first line 305 output = re.sub(r"^http.+\n+", "", input) 306 # Strip == Headline == formatting 307 output = re.sub(r"(?<=\n)=+\s+|\s+=+(?=\n)", "", output) 308 309 return output 310 311 if slug in self._titleCache: 312 pageTitle = self._titleCache[slug] 313 else: 314 pageTitle = self._findWikiTitle(slug) 315 self._titleCache[slug] = pageTitle 316 locateFile = lambda pageSlug: ( 317 f"{self.folder}wiki-{content.toKebabCase(pageSlug)}.txt" 318 ) 319 filePath = locateFile(pageTitle) 320 321 if files.isFile(filePath): 322 result = files.readFile(filePath) 323 else: 324 try: 325 result = _saveArticle(pageTitle) 326 except Exception as e: 327 logger.warning("Unable to save {}: {}", pageTitle, e) 328 raise 329 330 return _parseArticle(result) 331 332 def fetchArticles( 333 self, 334 slugs: list[str], 335 targetWords: int = None, 336 maxWords: int = None, 337 minWordsPerArticle: int = None, 338 ) -> "KScraper": 339 """ 340 Fetches and parses multiple Wikipedia articles with smart size management. 341 Stores articles in self.articles dict as {slug: content}. 342 Uses instance mode to automatically determine size parameters if not provided. 343 344 Args: 345 slugs: A list of Wikipedia page slugs. 346 targetWords: Target word count (distributes quota across articles). If None, uses mode-based estimate. 347 maxWords: Hard maximum word count (stops fetching when reached). If None, uses mode-based estimate. 348 minWordsPerArticle: Skip articles shorter than this. If None, uses mode-based estimate. 349 350 Returns self for chaining. 
351 """ 352 # Apply mode-based defaults if parameters not provided 353 if targetWords is None or maxWords is None or minWordsPerArticle is None: 354 params = self.estimateOptimalSize(len(slugs), self.mode) 355 targetWords = targetWords or params["targetWords"] 356 maxWords = maxWords or params["maxWords"] 357 minWordsPerArticle = ( 358 minWordsPerArticle 359 if minWordsPerArticle is not None 360 else params["minWordsPerArticle"] 361 ) 362 363 if not slugs: 364 self.articles = {} 365 self.metadata = { 366 "slugs": [], 367 "actualWords": 0, 368 "articleCount": 0, 369 "truncated": False, 370 "requestedArticles": 0, 371 } 372 return self 373 374 uniqueSlugs = list(dict.fromkeys(slugs)) 375 376 cachePath = self._fetchCachePath( 377 slugs, targetWords, maxWords, minWordsPerArticle 378 ) 379 if files.isFile(cachePath): 380 try: 381 payload = files.readFile(cachePath, mode="json") 382 cachedArticles = payload.get("articles", {}) 383 self.articles = { 384 slug: cachedArticles[slug] 385 for slug in slugs 386 if slug in cachedArticles 387 } 388 self.metadata = payload.get("metadata", {}) 389 logger.trace("Loaded fetch cache: {}", cachePath) 390 return self 391 except Exception as e: 392 logger.warning("Invalid fetch cache {}: {}", cachePath, e) 393 394 fetched_slugs = [] 395 total_words = 0 396 truncated = False 397 articles_fetched = 0 398 399 # Calculate per-article quota if target is specified 400 quota_per_article = targetWords // len(slugs) if targetWords and slugs else None 401 402 rawResults = {} 403 with ThreadPoolExecutor(max_workers=self.maxWorkers) as executor: 404 futures = { 405 executor.submit(self.fetchArticle, slug): slug for slug in uniqueSlugs 406 } 407 for future in as_completed(futures): 408 slug = futures[future] 409 try: 410 rawResults[slug] = future.result() 411 except Exception as e: 412 logger.warning(f"Failed to fetch {slug}: {e}") 413 414 for slug in slugs: 415 # Stop if we've hit the hard maximum 416 if maxWords and total_words >= maxWords: 417 
logger.info(f"Reached maxWords limit ({maxWords}), stopping fetch") 418 truncated = True 419 break 420 421 article_text = rawResults.get(slug) 422 if not article_text: 423 continue 424 425 word_count = len(article_text.split()) 426 427 # Skip articles that are too short 428 if word_count < minWordsPerArticle: 429 logger.debug( 430 f"Skipping {slug} ({word_count} words < {minWordsPerArticle} minimum)" 431 ) 432 continue 433 434 # Apply quota-based truncation if target is set 435 if quota_per_article and word_count > quota_per_article: 436 article_text = self._truncateAtSentence(article_text, quota_per_article) 437 word_count = len(article_text.split()) 438 truncated = True 439 logger.trace(f"Truncated {slug} to ~{quota_per_article} words") 440 441 # Apply hard max truncation 442 remaining_quota = maxWords - total_words if maxWords else None 443 if remaining_quota and word_count > remaining_quota: 444 article_text = self._truncateAtSentence(article_text, remaining_quota) 445 word_count = len(article_text.split()) 446 truncated = True 447 logger.trace( 448 f"Truncated {slug} to fit remaining quota ({remaining_quota} words)" 449 ) 450 451 # Store in articles dict 452 self.articles[slug] = article_text 453 fetched_slugs.append(slug) 454 total_words += word_count 455 articles_fetched += 1 456 457 logger.trace(f"Fetched {slug}: {word_count} words (total: {total_words})") 458 459 self.metadata = { 460 "slugs": fetched_slugs, 461 "actualWords": total_words, 462 "articleCount": articles_fetched, 463 "truncated": truncated, 464 "requestedArticles": len(slugs), 465 } 466 logger.trace(self.metadata) 467 468 try: 469 with open(cachePath, "w", encoding="utf-8") as f: 470 json.dump( 471 {"articles": self.articles, "metadata": self.metadata}, 472 f, 473 ensure_ascii=False, 474 ) 475 except Exception as e: 476 logger.warning("Unable to write fetch cache {}: {}", cachePath, e) 477 478 return self 479 480 def _truncateAtSentence(self, text: str, target_words: int) -> str: 481 """ 482 
Truncate text at sentence boundary closest to target word count. 483 484 Args: 485 text: Text to truncate. 486 target_words: Target word count. 487 488 Returns: 489 Truncated text ending at a sentence boundary. 490 """ 491 sentences = self._splitSentences(text) 492 result = [] 493 word_count = 0 494 495 for sentence in sentences: 496 sentence_words = len(sentence.split()) 497 if word_count + sentence_words > target_words: 498 break 499 result.append(sentence) 500 word_count += sentence_words 501 502 return " ".join(result) 503 504 def compileArticles(self, slugs: list[str] = None, separator: str = "\n\n") -> str: 505 """ 506 Compile fetched articles into a single concatenated string. 507 508 Args: 509 slugs: List of specific slugs to compile. If None, compiles all stored articles. 510 separator: String to join articles with (default: "\\n\\n"). 511 512 Returns: 513 Concatenated article content as a single string. 514 """ 515 if slugs is None: 516 # Compile all articles 517 articles_to_compile = list(self.articles.values()) 518 else: 519 # Compile only specified slugs 520 articles_to_compile = [ 521 self.articles[slug] for slug in slugs if slug in self.articles 522 ] 523 524 if not articles_to_compile: 525 raise ValueError( 526 "No articles to compile. Check if slugs are correct and articles are fetched." 527 ) 528 return separator.join(articles_to_compile)
def __init__(
    self,
    folder="01 Content/articles",
    language="en",
    mode: ScraperMode = "quick",
    maxWorkers: int = 4,
    root: str = "/Users/christianjansky/Library/CloudStorage/Dropbox/KOMETA-Draw",
) -> None:
    """
    Initializes the KScraper with a target folder, language, and processing mode.

    Args:
        folder: Folder partial where articles will be saved.
        language: The language for Wikipedia articles.
        mode: Processing mode
            - `quick` (10-15k words)
            - `balanced` (20-30k words)
            - `comprehensive` (40-50k words)
        maxWorkers: Number of concurrent fetch workers (clamped to at least 1).
        root: Absolute base directory that `folder` is resolved against.
    """
    # Prepend absolute path in case specimen not run from 03 DrawBot/
    self.folder = files.createFolder(
        f"{root.rstrip('/')}/{folder.rstrip('/')}-{language}/"
    )
    """Full path to the file system folder where articles will be saved."""
    self.mode = mode
    """Processing mode for size estimation."""
    self.language = language
    """Language code used for Wikipedia queries and cache keys."""
    self.maxWorkers = max(1, int(maxWorkers))
    """Maximum number of concurrent workers used by fetchArticles."""
    self.articles = {}
    """Dict storing fetched articles as {slug: content}."""
    self._titleCache = {}
    """Per-instance cache for resolved page titles by requested slug."""
    # Initialise so reading metadata before the first fetchArticles call
    # does not raise AttributeError.
    self.metadata = {
        "slugs": [],
        "actualWords": 0,
        "articleCount": 0,
        "truncated": False,
        "requestedArticles": 0,
    }
    """Metadata about the last fetchArticles call."""
    wikipedia.set_lang(language)
Initializes the KScraper with a target folder, language, and processing mode.
Arguments:
- folder: Folder partial where articles will be saved.
- language: The language for Wikipedia articles.
- mode: Processing mode
`quick` (10-15k words), `balanced` (20-30k words), or `comprehensive` (40-50k words)
- maxWorkers: Number of concurrent fetch workers.
23 @staticmethod 24 def estimateOptimalSize(article_count: int, mode: ScraperMode = "quick") -> dict: 25 """ 26 Recommends optimal text size parameters based on article count and use case. 27 28 Args: 29 article_count: Number of articles to fetch. 30 mode: Processing mode indicating desired size and depth of content. 31 32 Returns: 33 Dict with recommended 'targetWords', 'maxWords', and 'minWordsPerArticle'. 34 """ 35 modes = { 36 "quick": {"base": 10000, "max": 15000, "min_per": 250}, 37 "balanced": {"base": 20000, "max": 30000, "min_per": 300}, 38 "comprehensive": {"base": 40000, "max": 50000, "min_per": 500}, 39 } 40 41 config = modes.get(mode, modes["quick"]) 42 43 # Adjust target based on article count 44 # More articles = higher target (but capped at max) 45 target = min(config["base"] * (1 + article_count // 5), config["max"]) 46 47 return { 48 "targetWords": target, 49 "maxWords": config["max"], 50 "minWordsPerArticle": config["min_per"], 51 }
Recommends optimal text size parameters based on article count and use case.
Arguments:
- article_count: Number of articles to fetch.
- mode: Processing mode indicating desired size and depth of content.
Returns:
Dict with recommended 'targetWords', 'maxWords', and 'minWordsPerArticle'.
def flushCache(self, mode: CacheFlushMode = "all") -> dict[str, int | str]:
    """
    Remove cached scraper files from disk and reset the in-memory state.

    The in-memory state (`articles`, `_titleCache`, `metadata`) is reset on
    every call, whichever `mode` is selected.

    Args:
        mode: Which cache files to remove:
            - "fetch": Remove `.fetch-cache-*.json` files.
            - "articles": Remove `wiki-*.txt` files.
            - "all": Remove both cache types.

    Returns:
        Dict with deletion counters and selected mode.
    """
    # (kind, filename prefix, filename suffix, warning logged on failure)
    rules = (
        ("fetch", ".fetch-cache-", ".json", "Unable to delete fetch cache {}: {}"),
        ("articles", "wiki-", ".txt", "Unable to delete article cache {}: {}"),
    )
    removed = {"fetch": 0, "articles": 0}

    try:
        names = os.listdir(self.folder)
    except Exception as e:
        # Best effort: an unreadable folder just means nothing gets deleted
        logger.warning("Unable to list cache folder {}: {}", self.folder, e)
        names = []

    for name in names:
        # self.folder is concatenated directly, so it must end with a separator
        target = f"{self.folder}{name}"
        for kind, prefix, suffix, failure in rules:
            if mode not in (kind, "all"):
                continue
            if not (name.startswith(prefix) and name.endswith(suffix)):
                continue
            try:
                os.remove(target)
            except Exception as e:
                logger.warning(failure, target, e)
            else:
                removed[kind] += 1

    self.articles = {}
    self._titleCache = {}
    self.metadata = {
        "slugs": [],
        "actualWords": 0,
        "articleCount": 0,
        "truncated": False,
        "requestedArticles": 0,
    }

    result = {
        "mode": mode,
        "deletedFetchCaches": removed["fetch"],
        "deletedArticleCaches": removed["articles"],
    }
    logger.info("Flushed cache: {}", result)
    return result
Flush scraper cache files from disk and clear in-memory cache state.
Arguments:
- mode: Which cache files to remove:
  - "fetch": Remove `.fetch-cache-*.json` files.
  - "articles": Remove `wiki-*.txt` files.
  - "all": Remove both cache types.
Returns:
Dict with deletion counters and selected mode.
def fetchArticle(self, slug: str) -> str:
    """
    Fetches a Wikipedia article by slug, saves it, and returns parsed content.

    The slug is first resolved to a page title (remembered per instance in
    `self._titleCache`). If the article file for that title already exists in
    `self.folder` it is read back; otherwise the page is downloaded, cleaned
    and written to that file.

    Args:
        slug: The Wikipedia page slug.

    Returns:
        The parsed article content (leading URL line and `== Headline ==`
        markers removed).

    Raises:
        Exception: Whatever the download, cleaning or file write raised; it is
            logged as a warning and re-raised unchanged.
    """

    def locateFile(pageSlug: str) -> str:
        """Builds the article file path for a page title or slug."""
        return f"{self.folder}wiki-{content.toKebabCase(pageSlug)}.txt"

    def _saveArticle(slug: str) -> str:
        """
        Saves the Wikipedia article as a .txt file and returns its content.

        Args:
            slug: The Wikipedia page slug.

        Returns:
            The saved article content.
        """
        logger.debug("Saving article for {}", slug)

        # Retry empty/invalid API responses with exponential backoff (1s, 2s, 4s);
        # the last failure propagates to the caller.
        max_retries = 4
        for attempt in range(max_retries):
            try:
                page = wikipedia.page(slug, auto_suggest=False)
                break
            except (json.JSONDecodeError, ValueError) as e:
                if attempt < max_retries - 1:
                    wait = 2**attempt
                    logger.warning(
                        "Empty/invalid response fetching {} (attempt {}/{}), retrying in {}s: {}",
                        slug,
                        attempt + 1,
                        max_retries,
                        wait,
                        e,
                    )
                    time.sleep(wait)
                else:
                    raise

        # Build the full output BEFORE opening the file. Opening first would
        # leave an empty wiki-*.txt behind if reading or cleaning the page
        # failed, and later calls would treat that empty file as a valid hit.
        pageClean = (
            KTextCleaner(page.content)
            .cleanWikipedia()
            .sanitizeForbidden()
            .removeLaTeX()
            .cleanPunctuation()
            .improvePunctuation()
            .cleanWhitespace()
            .get()
        )
        output = f"{page.url}\n\n{pageClean}"

        # Explicit UTF-8 so non-ASCII article text never depends on the locale
        # NOTE(review): files.readFile is assumed to read UTF-8 as well — confirm.
        with open(locateFile(slug), "w", encoding="utf-8") as f:
            f.write(output)
        return output

    def _parseArticle(raw: str) -> str:
        """
        Parses the saved article content, removing URL and formatting.

        Args:
            raw: The raw article content.

        Returns:
            The cleaned article content.
        """
        # Strip article URL from first line
        output = re.sub(r"^http.+\n+", "", raw)
        # Strip == Headline == formatting
        output = re.sub(r"(?<=\n)=+\s+|\s+=+(?=\n)", "", output)

        return output

    if slug in self._titleCache:
        pageTitle = self._titleCache[slug]
    else:
        pageTitle = self._findWikiTitle(slug)
        self._titleCache[slug] = pageTitle

    filePath = locateFile(pageTitle)

    if files.isFile(filePath):
        result = files.readFile(filePath)
    else:
        try:
            result = _saveArticle(pageTitle)
        except Exception as e:
            logger.warning("Unable to save {}: {}", pageTitle, e)
            raise

    return _parseArticle(result)
Fetches a Wikipedia article by slug, saves it, and returns parsed content.
Arguments:
- slug: The Wikipedia page slug.
Returns:
The parsed article content.
332 def fetchArticles( 333 self, 334 slugs: list[str], 335 targetWords: int = None, 336 maxWords: int = None, 337 minWordsPerArticle: int = None, 338 ) -> "KScraper": 339 """ 340 Fetches and parses multiple Wikipedia articles with smart size management. 341 Stores articles in self.articles dict as {slug: content}. 342 Uses instance mode to automatically determine size parameters if not provided. 343 344 Args: 345 slugs: A list of Wikipedia page slugs. 346 targetWords: Target word count (distributes quota across articles). If None, uses mode-based estimate. 347 maxWords: Hard maximum word count (stops fetching when reached). If None, uses mode-based estimate. 348 minWordsPerArticle: Skip articles shorter than this. If None, uses mode-based estimate. 349 350 Returns self for chaining. 351 """ 352 # Apply mode-based defaults if parameters not provided 353 if targetWords is None or maxWords is None or minWordsPerArticle is None: 354 params = self.estimateOptimalSize(len(slugs), self.mode) 355 targetWords = targetWords or params["targetWords"] 356 maxWords = maxWords or params["maxWords"] 357 minWordsPerArticle = ( 358 minWordsPerArticle 359 if minWordsPerArticle is not None 360 else params["minWordsPerArticle"] 361 ) 362 363 if not slugs: 364 self.articles = {} 365 self.metadata = { 366 "slugs": [], 367 "actualWords": 0, 368 "articleCount": 0, 369 "truncated": False, 370 "requestedArticles": 0, 371 } 372 return self 373 374 uniqueSlugs = list(dict.fromkeys(slugs)) 375 376 cachePath = self._fetchCachePath( 377 slugs, targetWords, maxWords, minWordsPerArticle 378 ) 379 if files.isFile(cachePath): 380 try: 381 payload = files.readFile(cachePath, mode="json") 382 cachedArticles = payload.get("articles", {}) 383 self.articles = { 384 slug: cachedArticles[slug] 385 for slug in slugs 386 if slug in cachedArticles 387 } 388 self.metadata = payload.get("metadata", {}) 389 logger.trace("Loaded fetch cache: {}", cachePath) 390 return self 391 except Exception as e: 392 
logger.warning("Invalid fetch cache {}: {}", cachePath, e) 393 394 fetched_slugs = [] 395 total_words = 0 396 truncated = False 397 articles_fetched = 0 398 399 # Calculate per-article quota if target is specified 400 quota_per_article = targetWords // len(slugs) if targetWords and slugs else None 401 402 rawResults = {} 403 with ThreadPoolExecutor(max_workers=self.maxWorkers) as executor: 404 futures = { 405 executor.submit(self.fetchArticle, slug): slug for slug in uniqueSlugs 406 } 407 for future in as_completed(futures): 408 slug = futures[future] 409 try: 410 rawResults[slug] = future.result() 411 except Exception as e: 412 logger.warning(f"Failed to fetch {slug}: {e}") 413 414 for slug in slugs: 415 # Stop if we've hit the hard maximum 416 if maxWords and total_words >= maxWords: 417 logger.info(f"Reached maxWords limit ({maxWords}), stopping fetch") 418 truncated = True 419 break 420 421 article_text = rawResults.get(slug) 422 if not article_text: 423 continue 424 425 word_count = len(article_text.split()) 426 427 # Skip articles that are too short 428 if word_count < minWordsPerArticle: 429 logger.debug( 430 f"Skipping {slug} ({word_count} words < {minWordsPerArticle} minimum)" 431 ) 432 continue 433 434 # Apply quota-based truncation if target is set 435 if quota_per_article and word_count > quota_per_article: 436 article_text = self._truncateAtSentence(article_text, quota_per_article) 437 word_count = len(article_text.split()) 438 truncated = True 439 logger.trace(f"Truncated {slug} to ~{quota_per_article} words") 440 441 # Apply hard max truncation 442 remaining_quota = maxWords - total_words if maxWords else None 443 if remaining_quota and word_count > remaining_quota: 444 article_text = self._truncateAtSentence(article_text, remaining_quota) 445 word_count = len(article_text.split()) 446 truncated = True 447 logger.trace( 448 f"Truncated {slug} to fit remaining quota ({remaining_quota} words)" 449 ) 450 451 # Store in articles dict 452 
self.articles[slug] = article_text 453 fetched_slugs.append(slug) 454 total_words += word_count 455 articles_fetched += 1 456 457 logger.trace(f"Fetched {slug}: {word_count} words (total: {total_words})") 458 459 self.metadata = { 460 "slugs": fetched_slugs, 461 "actualWords": total_words, 462 "articleCount": articles_fetched, 463 "truncated": truncated, 464 "requestedArticles": len(slugs), 465 } 466 logger.trace(self.metadata) 467 468 try: 469 with open(cachePath, "w", encoding="utf-8") as f: 470 json.dump( 471 {"articles": self.articles, "metadata": self.metadata}, 472 f, 473 ensure_ascii=False, 474 ) 475 except Exception as e: 476 logger.warning("Unable to write fetch cache {}: {}", cachePath, e) 477 478 return self
Fetches and parses multiple Wikipedia articles with smart size management. Stores articles in self.articles dict as {slug: content}. Uses instance mode to automatically determine size parameters if not provided.
Arguments:
- slugs: A list of Wikipedia page slugs.
- targetWords: Target word count (distributes quota across articles). If None, uses mode-based estimate.
- maxWords: Hard maximum word count (stops fetching when reached). If None, uses mode-based estimate.
- minWordsPerArticle: Skip articles shorter than this. If None, uses mode-based estimate.
Returns self for chaining.
def compileArticles(self, slugs: list[str] = None, separator: str = "\n\n") -> str:
    """
    Join stored articles into one string.

    Args:
        slugs: Slugs to include, in the given order. Slugs that are not stored
            are skipped. If None, every stored article is included in
            storage order.
        separator: String placed between articles (default: "\\n\\n").

    Returns:
        The selected articles joined by `separator`.

    Raises:
        ValueError: If no article was selected.
    """
    stored = self.articles
    # Iterating the dict itself yields its keys, i.e. "all stored slugs"
    wanted = stored if slugs is None else slugs

    picked = []
    for key in wanted:
        if key in stored:
            picked.append(stored[key])

    if picked:
        return separator.join(picked)

    raise ValueError(
        "No articles to compile. Check if slugs are correct and articles are fetched."
    )
Compile fetched articles into a single concatenated string.
Arguments:
- slugs: List of specific slugs to compile. If None, compiles all stored articles.
- separator: String to join articles with (default: "\n\n").
Returns:
Concatenated article content as a single string.