classes.c30_scraper

  1from typing import Literal
  2import wikipedia
  3import re
  4import json
  5import hashlib
  6import os
  7import time
  8from concurrent.futures import ThreadPoolExecutor, as_completed
  9from loguru import logger
 10
 11from lib import files, helpers, content
 12from .c34_text_cleaner import KTextCleaner
 13
# Values accepted by KScraper.flushCache(mode=...): "fetch" removes the
# `.fetch-cache-*.json` files, "articles" removes the `wiki-*.txt` files,
# "all" removes both.
CacheFlushMode = Literal["fetch", "articles", "all"]
 15
 16
 17class KScraper:
 18    ScraperMode = Literal["quick", "balanced", "comprehensive"]
 19    metadata: dict
 20    """Metadata about the last fetchArticles call."""
 21
 22    @staticmethod
 23    def estimateOptimalSize(article_count: int, mode: ScraperMode = "quick") -> dict:
 24        """
 25        Recommends optimal text size parameters based on article count and use case.
 26
 27        Args:
 28            article_count: Number of articles to fetch.
 29            mode: Processing mode indicating desired size and depth of content.
 30
 31        Returns:
 32            Dict with recommended 'targetWords', 'maxWords', and 'minWordsPerArticle'.
 33        """
 34        modes = {
 35            "quick": {"base": 10000, "max": 15000, "min_per": 250},
 36            "balanced": {"base": 20000, "max": 30000, "min_per": 300},
 37            "comprehensive": {"base": 40000, "max": 50000, "min_per": 500},
 38        }
 39
 40        config = modes.get(mode, modes["quick"])
 41
 42        # Adjust target based on article count
 43        # More articles = higher target (but capped at max)
 44        target = min(config["base"] * (1 + article_count // 5), config["max"])
 45
 46        return {
 47            "targetWords": target,
 48            "maxWords": config["max"],
 49            "minWordsPerArticle": config["min_per"],
 50        }
 51
 52    @staticmethod
 53    def _findWikiTitle(slug: str) -> str:
 54        """
 55        Finds a valid Wikipedia page title or its closest match.
 56
 57        Args:
 58            slug: The Wikipedia page slug.
 59
 60        Returns:
 61            The resolved Wikipedia page title.
 62        """
 63        max_retries = 4
 64        for attempt in range(max_retries):
 65            try:
 66                try:
 67                    # Find exact match
 68                    page = wikipedia.page(slug, auto_suggest=False)
 69                except wikipedia.exceptions.PageError as e:
 70                    # Correct spelling error
 71                    suggested = wikipedia.suggest(e.pageid)
 72                    logger.warning("Correcting {} to {}", e.pageid, suggested)
 73                    page = wikipedia.page(suggested, auto_suggest=False)
 74            except wikipedia.exceptions.DisambiguationError as e:
 75                # Pick first possible match if it may refer to multiple pages
 76                logger.warning("Possible pages for {}: {}", slug, e.options)
 77                page = wikipedia.page(helpers.pickFirst(e.options), auto_suggest=False)
 78            except (json.JSONDecodeError, ValueError) as e:
 79                if attempt < max_retries - 1:
 80                    wait = 2**attempt
 81                    logger.warning(
 82                        "Empty/invalid response resolving title for {} (attempt {}/{}), retrying in {}s: {}",
 83                        slug,
 84                        attempt + 1,
 85                        max_retries,
 86                        wait,
 87                        e,
 88                    )
 89                    time.sleep(wait)
 90                    continue
 91                raise
 92            break
 93
 94        return page.title
 95
 96    @staticmethod
 97    def _splitSentences(text: str) -> list[str]:
 98        """
 99        Split text into sentences (simple implementation).
100
101        Args:
102            text: Input text to split.
103
104        Returns:
105            List of sentences.
106        """
107        # Simple sentence splitting on common terminators
108        sentences = re.split(r"(?<=[.!?])\s+", text)
109        return [s.strip() for s in sentences if s.strip()]
110
111    def __init__(
112        self,
113        folder="01 Content/articles",
114        language="en",
115        mode: ScraperMode = "quick",
116        maxWorkers: int = 4,
117    ) -> None:
118        """
119        Initializes the KScraper with a target folder, language, and processing mode.
120
121        Args:
122            folder: Folder partial where articles will be saved.
123            language: The language for Wikipedia articles.
124            mode: Processing mode
125                - `quick` (10-15k words)
126                - `balanced` (20-30k words)
127                - `comprehensive` (40-50k words)
128            maxWorkers: Number of concurrent fetch workers.
129        """
130        # Prepend absolute path in case specimen not run from 03 DrawBot/
131        self.folder = files.createFolder(
132            f"/Users/christianjansky/Library/CloudStorage/Dropbox/KOMETA-Draw/{folder.rstrip('/')}-{language}/"
133        )
134        """Full path to the file system folder where articles will be saved."""
135        self.mode = mode
136        """Processing mode for size estimation."""
137        self.language = language
138        """Language code used for Wikipedia queries and cache keys."""
139        self.maxWorkers = max(1, int(maxWorkers))
140        """Maximum number of concurrent workers used by fetchArticles."""
141        self.articles = {}
142        """Dict storing fetched articles as {slug: content}."""
143        self._titleCache = {}
144        """Per-instance cache for resolved page titles by requested slug."""
145        wikipedia.set_lang(language)
146
147    def _fetchCachePath(
148        self,
149        slugs: list[str],
150        targetWords: int | None,
151        maxWords: int | None,
152        minWordsPerArticle: int | None,
153    ) -> str:
154        """Return deterministic path for a fetch result cache file."""
155        payload = {
156            "language": self.language,
157            "mode": self.mode,
158            "slugs": sorted(slugs),
159            "targetWords": targetWords,
160            "maxWords": maxWords,
161            "minWordsPerArticle": minWordsPerArticle,
162        }
163        digest = hashlib.md5(
164            json.dumps(payload, sort_keys=True, ensure_ascii=True).encode("utf-8")
165        ).hexdigest()
166        return f"{self.folder}.fetch-cache-{digest}.json"
167
168    def flushCache(self, mode: CacheFlushMode = "all") -> dict[str, int | str]:
169        """
170        Flush scraper cache files from disk and clear in-memory cache state.
171
172        Args:
173            mode: Which cache files to remove:
174                - "fetch": Remove `.fetch-cache-*.json` files.
175                - "articles": Remove `wiki-*.txt` files.
176                - "all": Remove both cache types.
177
178        Returns:
179            Dict with deletion counters and selected mode.
180        """
181        deleted_fetch = 0
182        deleted_articles = 0
183
184        try:
185            folder_entries = os.listdir(self.folder)
186        except Exception as e:
187            logger.warning("Unable to list cache folder {}: {}", self.folder, e)
188            folder_entries = []
189
190        for entry in folder_entries:
191            if (
192                mode in ("fetch", "all")
193                and entry.startswith(".fetch-cache-")
194                and entry.endswith(".json")
195            ):
196                file_path = f"{self.folder}{entry}"
197                try:
198                    os.remove(file_path)
199                    deleted_fetch += 1
200                except Exception as e:
201                    logger.warning("Unable to delete fetch cache {}: {}", file_path, e)
202
203            if (
204                mode in ("articles", "all")
205                and entry.startswith("wiki-")
206                and entry.endswith(".txt")
207            ):
208                file_path = f"{self.folder}{entry}"
209                try:
210                    os.remove(file_path)
211                    deleted_articles += 1
212                except Exception as e:
213                    logger.warning(
214                        "Unable to delete article cache {}: {}", file_path, e
215                    )
216
217        self.articles = {}
218        self._titleCache = {}
219        self.metadata = {
220            "slugs": [],
221            "actualWords": 0,
222            "articleCount": 0,
223            "truncated": False,
224            "requestedArticles": 0,
225        }
226
227        result = {
228            "mode": mode,
229            "deletedFetchCaches": deleted_fetch,
230            "deletedArticleCaches": deleted_articles,
231        }
232        logger.info("Flushed cache: {}", result)
233        return result
234
235    def fetchArticle(self, slug: str) -> str:
236        """
237        Fetches a Wikipedia article by slug, saves it, and returns parsed content.
238
239        Args:
240            slug: The Wikipedia page slug.
241
242        Returns:
243            The parsed article content.
244        """
245
246        def _saveArticle(slug: str) -> str:
247            """
248            Saves the Wikipedia article as a .txt file and returns its content.
249
250            Args:
251                slug: The Wikipedia page slug.
252
253            Returns:
254                The saved article content.
255            """
256            logger.debug("Saving article for {}", slug)
257
258            max_retries = 4
259            for attempt in range(max_retries):
260                try:
261                    page = wikipedia.page(slug, auto_suggest=False)
262                    break
263                except (json.JSONDecodeError, ValueError) as e:
264                    if attempt < max_retries - 1:
265                        wait = 2**attempt
266                        logger.warning(
267                            "Empty/invalid response fetching {} (attempt {}/{}), retrying in {}s: {}",
268                            slug,
269                            attempt + 1,
270                            max_retries,
271                            wait,
272                            e,
273                        )
274                        time.sleep(wait)
275                    else:
276                        raise
277
278            with open(locateFile(slug), "w") as f:
279                pageClean = (
280                    KTextCleaner(page.content)
281                    .cleanWikipedia()
282                    .sanitizeForbidden()
283                    .removeLaTeX()
284                    .cleanPunctuation()
285                    .improvePunctuation()
286                    .cleanWhitespace()
287                    .get()
288                )
289                output = f"{page.url}\n\n{pageClean}"
290                f.write(output)
291                return output
292
293        def _parseArticle(input: str) -> str:
294            """
295            Parses the saved article content, removing URL and formatting.
296
297            Args:
298                input: The raw article content.
299
300            Returns:
301                The cleaned article content.
302            """
303            # Strip article URL from first line
304            output = re.sub(r"^http.+\n+", "", input)
305            # Strip == Headline == formatting
306            output = re.sub(r"(?<=\n)=+\s+|\s+=+(?=\n)", "", output)
307
308            return output
309
310        if slug in self._titleCache:
311            pageTitle = self._titleCache[slug]
312        else:
313            pageTitle = self._findWikiTitle(slug)
314            self._titleCache[slug] = pageTitle
315        locateFile = lambda pageSlug: (
316            f"{self.folder}wiki-{content.toKebabCase(pageSlug)}.txt"
317        )
318        filePath = locateFile(pageTitle)
319
320        if files.isFile(filePath):
321            result = files.readFile(filePath)
322        else:
323            try:
324                result = _saveArticle(pageTitle)
325            except Exception as e:
326                logger.warning("Unable to save {}: {}", pageTitle, e)
327                raise
328
329        return _parseArticle(result)
330
331    def fetchArticles(
332        self,
333        slugs: list[str],
334        targetWords: int = None,
335        maxWords: int = None,
336        minWordsPerArticle: int = None,
337    ) -> "KScraper":
338        """
339        Fetches and parses multiple Wikipedia articles with smart size management.
340        Stores articles in self.articles dict as {slug: content}.
341        Uses instance mode to automatically determine size parameters if not provided.
342
343        Args:
344            slugs: A list of Wikipedia page slugs.
345            targetWords: Target word count (distributes quota across articles). If None, uses mode-based estimate.
346            maxWords: Hard maximum word count (stops fetching when reached). If None, uses mode-based estimate.
347            minWordsPerArticle: Skip articles shorter than this. If None, uses mode-based estimate.
348
349        Returns self for chaining.
350        """
351        # Apply mode-based defaults if parameters not provided
352        if targetWords is None or maxWords is None or minWordsPerArticle is None:
353            params = self.estimateOptimalSize(len(slugs), self.mode)
354            targetWords = targetWords or params["targetWords"]
355            maxWords = maxWords or params["maxWords"]
356            minWordsPerArticle = (
357                minWordsPerArticle
358                if minWordsPerArticle is not None
359                else params["minWordsPerArticle"]
360            )
361
362        if not slugs:
363            self.articles = {}
364            self.metadata = {
365                "slugs": [],
366                "actualWords": 0,
367                "articleCount": 0,
368                "truncated": False,
369                "requestedArticles": 0,
370            }
371            return self
372
373        uniqueSlugs = list(dict.fromkeys(slugs))
374
375        cachePath = self._fetchCachePath(
376            slugs, targetWords, maxWords, minWordsPerArticle
377        )
378        if files.isFile(cachePath):
379            try:
380                payload = files.readFile(cachePath, mode="json")
381                cachedArticles = payload.get("articles", {})
382                self.articles = {
383                    slug: cachedArticles[slug]
384                    for slug in slugs
385                    if slug in cachedArticles
386                }
387                self.metadata = payload.get("metadata", {})
388                logger.trace("Loaded fetch cache: {}", cachePath)
389                return self
390            except Exception as e:
391                logger.warning("Invalid fetch cache {}: {}", cachePath, e)
392
393        fetched_slugs = []
394        total_words = 0
395        truncated = False
396        articles_fetched = 0
397
398        # Calculate per-article quota if target is specified
399        quota_per_article = targetWords // len(slugs) if targetWords and slugs else None
400
401        rawResults = {}
402        with ThreadPoolExecutor(max_workers=self.maxWorkers) as executor:
403            futures = {
404                executor.submit(self.fetchArticle, slug): slug for slug in uniqueSlugs
405            }
406            for future in as_completed(futures):
407                slug = futures[future]
408                try:
409                    rawResults[slug] = future.result()
410                except Exception as e:
411                    logger.warning(f"Failed to fetch {slug}: {e}")
412
413        for slug in slugs:
414            # Stop if we've hit the hard maximum
415            if maxWords and total_words >= maxWords:
416                logger.info(f"Reached maxWords limit ({maxWords}), stopping fetch")
417                truncated = True
418                break
419
420            article_text = rawResults.get(slug)
421            if not article_text:
422                continue
423
424            word_count = len(article_text.split())
425
426            # Skip articles that are too short
427            if word_count < minWordsPerArticle:
428                logger.debug(
429                    f"Skipping {slug} ({word_count} words < {minWordsPerArticle} minimum)"
430                )
431                continue
432
433            # Apply quota-based truncation if target is set
434            if quota_per_article and word_count > quota_per_article:
435                article_text = self._truncateAtSentence(article_text, quota_per_article)
436                word_count = len(article_text.split())
437                truncated = True
438                logger.trace(f"Truncated {slug} to ~{quota_per_article} words")
439
440            # Apply hard max truncation
441            remaining_quota = maxWords - total_words if maxWords else None
442            if remaining_quota and word_count > remaining_quota:
443                article_text = self._truncateAtSentence(article_text, remaining_quota)
444                word_count = len(article_text.split())
445                truncated = True
446                logger.trace(
447                    f"Truncated {slug} to fit remaining quota ({remaining_quota} words)"
448                )
449
450            # Store in articles dict
451            self.articles[slug] = article_text
452            fetched_slugs.append(slug)
453            total_words += word_count
454            articles_fetched += 1
455
456            logger.trace(f"Fetched {slug}: {word_count} words (total: {total_words})")
457
458        self.metadata = {
459            "slugs": fetched_slugs,
460            "actualWords": total_words,
461            "articleCount": articles_fetched,
462            "truncated": truncated,
463            "requestedArticles": len(slugs),
464        }
465        logger.trace(self.metadata)
466
467        try:
468            with open(cachePath, "w", encoding="utf-8") as f:
469                json.dump(
470                    {"articles": self.articles, "metadata": self.metadata},
471                    f,
472                    ensure_ascii=False,
473                )
474        except Exception as e:
475            logger.warning("Unable to write fetch cache {}: {}", cachePath, e)
476
477        return self
478
479    def _truncateAtSentence(self, text: str, target_words: int) -> str:
480        """
481        Truncate text at sentence boundary closest to target word count.
482
483        Args:
484            text: Text to truncate.
485            target_words: Target word count.
486
487        Returns:
488            Truncated text ending at a sentence boundary.
489        """
490        sentences = self._splitSentences(text)
491        result = []
492        word_count = 0
493
494        for sentence in sentences:
495            sentence_words = len(sentence.split())
496            if word_count + sentence_words > target_words:
497                break
498            result.append(sentence)
499            word_count += sentence_words
500
501        return " ".join(result)
502
503    def compileArticles(self, slugs: list[str] = None, separator: str = "\n\n") -> str:
504        """
505        Compile fetched articles into a single concatenated string.
506
507        Args:
508            slugs: List of specific slugs to compile. If None, compiles all stored articles.
509            separator: String to join articles with (default: "\\n\\n").
510
511        Returns:
512            Concatenated article content as a single string.
513        """
514        if slugs is None:
515            # Compile all articles
516            articles_to_compile = list(self.articles.values())
517        else:
518            # Compile only specified slugs
519            articles_to_compile = [
520                self.articles[slug] for slug in slugs if slug in self.articles
521            ]
522
523        if not articles_to_compile:
524            raise ValueError(
525                "No articles to compile. Check if slugs are correct and articles are  fetched."
526            )
527        return separator.join(articles_to_compile)
CacheFlushMode = typing.Literal['fetch', 'articles', 'all']
class KScraper:
 18class KScraper:
 19    ScraperMode = Literal["quick", "balanced", "comprehensive"]
 20    metadata: dict
 21    """Metadata about the last fetchArticles call."""
 22
 23    @staticmethod
 24    def estimateOptimalSize(article_count: int, mode: ScraperMode = "quick") -> dict:
 25        """
 26        Recommends optimal text size parameters based on article count and use case.
 27
 28        Args:
 29            article_count: Number of articles to fetch.
 30            mode: Processing mode indicating desired size and depth of content.
 31
 32        Returns:
 33            Dict with recommended 'targetWords', 'maxWords', and 'minWordsPerArticle'.
 34        """
 35        modes = {
 36            "quick": {"base": 10000, "max": 15000, "min_per": 250},
 37            "balanced": {"base": 20000, "max": 30000, "min_per": 300},
 38            "comprehensive": {"base": 40000, "max": 50000, "min_per": 500},
 39        }
 40
 41        config = modes.get(mode, modes["quick"])
 42
 43        # Adjust target based on article count
 44        # More articles = higher target (but capped at max)
 45        target = min(config["base"] * (1 + article_count // 5), config["max"])
 46
 47        return {
 48            "targetWords": target,
 49            "maxWords": config["max"],
 50            "minWordsPerArticle": config["min_per"],
 51        }
 52
 53    @staticmethod
 54    def _findWikiTitle(slug: str) -> str:
 55        """
 56        Finds a valid Wikipedia page title or its closest match.
 57
 58        Args:
 59            slug: The Wikipedia page slug.
 60
 61        Returns:
 62            The resolved Wikipedia page title.
 63        """
 64        max_retries = 4
 65        for attempt in range(max_retries):
 66            try:
 67                try:
 68                    # Find exact match
 69                    page = wikipedia.page(slug, auto_suggest=False)
 70                except wikipedia.exceptions.PageError as e:
 71                    # Correct spelling error
 72                    suggested = wikipedia.suggest(e.pageid)
 73                    logger.warning("Correcting {} to {}", e.pageid, suggested)
 74                    page = wikipedia.page(suggested, auto_suggest=False)
 75            except wikipedia.exceptions.DisambiguationError as e:
 76                # Pick first possible match if it may refer to multiple pages
 77                logger.warning("Possible pages for {}: {}", slug, e.options)
 78                page = wikipedia.page(helpers.pickFirst(e.options), auto_suggest=False)
 79            except (json.JSONDecodeError, ValueError) as e:
 80                if attempt < max_retries - 1:
 81                    wait = 2**attempt
 82                    logger.warning(
 83                        "Empty/invalid response resolving title for {} (attempt {}/{}), retrying in {}s: {}",
 84                        slug,
 85                        attempt + 1,
 86                        max_retries,
 87                        wait,
 88                        e,
 89                    )
 90                    time.sleep(wait)
 91                    continue
 92                raise
 93            break
 94
 95        return page.title
 96
 97    @staticmethod
 98    def _splitSentences(text: str) -> list[str]:
 99        """
100        Split text into sentences (simple implementation).
101
102        Args:
103            text: Input text to split.
104
105        Returns:
106            List of sentences.
107        """
108        # Simple sentence splitting on common terminators
109        sentences = re.split(r"(?<=[.!?])\s+", text)
110        return [s.strip() for s in sentences if s.strip()]
111
112    def __init__(
113        self,
114        folder="01 Content/articles",
115        language="en",
116        mode: ScraperMode = "quick",
117        maxWorkers: int = 4,
118    ) -> None:
119        """
120        Initializes the KScraper with a target folder, language, and processing mode.
121
122        Args:
123            folder: Folder partial where articles will be saved.
124            language: The language for Wikipedia articles.
125            mode: Processing mode
126                - `quick` (10-15k words)
127                - `balanced` (20-30k words)
128                - `comprehensive` (40-50k words)
129            maxWorkers: Number of concurrent fetch workers.
130        """
131        # Prepend absolute path in case specimen not run from 03 DrawBot/
132        self.folder = files.createFolder(
133            f"/Users/christianjansky/Library/CloudStorage/Dropbox/KOMETA-Draw/{folder.rstrip('/')}-{language}/"
134        )
135        """Full path to the file system folder where articles will be saved."""
136        self.mode = mode
137        """Processing mode for size estimation."""
138        self.language = language
139        """Language code used for Wikipedia queries and cache keys."""
140        self.maxWorkers = max(1, int(maxWorkers))
141        """Maximum number of concurrent workers used by fetchArticles."""
142        self.articles = {}
143        """Dict storing fetched articles as {slug: content}."""
144        self._titleCache = {}
145        """Per-instance cache for resolved page titles by requested slug."""
146        wikipedia.set_lang(language)
147
148    def _fetchCachePath(
149        self,
150        slugs: list[str],
151        targetWords: int | None,
152        maxWords: int | None,
153        minWordsPerArticle: int | None,
154    ) -> str:
155        """Return deterministic path for a fetch result cache file."""
156        payload = {
157            "language": self.language,
158            "mode": self.mode,
159            "slugs": sorted(slugs),
160            "targetWords": targetWords,
161            "maxWords": maxWords,
162            "minWordsPerArticle": minWordsPerArticle,
163        }
164        digest = hashlib.md5(
165            json.dumps(payload, sort_keys=True, ensure_ascii=True).encode("utf-8")
166        ).hexdigest()
167        return f"{self.folder}.fetch-cache-{digest}.json"
168
169    def flushCache(self, mode: CacheFlushMode = "all") -> dict[str, int | str]:
170        """
171        Flush scraper cache files from disk and clear in-memory cache state.
172
173        Args:
174            mode: Which cache files to remove:
175                - "fetch": Remove `.fetch-cache-*.json` files.
176                - "articles": Remove `wiki-*.txt` files.
177                - "all": Remove both cache types.
178
179        Returns:
180            Dict with deletion counters and selected mode.
181        """
182        deleted_fetch = 0
183        deleted_articles = 0
184
185        try:
186            folder_entries = os.listdir(self.folder)
187        except Exception as e:
188            logger.warning("Unable to list cache folder {}: {}", self.folder, e)
189            folder_entries = []
190
191        for entry in folder_entries:
192            if (
193                mode in ("fetch", "all")
194                and entry.startswith(".fetch-cache-")
195                and entry.endswith(".json")
196            ):
197                file_path = f"{self.folder}{entry}"
198                try:
199                    os.remove(file_path)
200                    deleted_fetch += 1
201                except Exception as e:
202                    logger.warning("Unable to delete fetch cache {}: {}", file_path, e)
203
204            if (
205                mode in ("articles", "all")
206                and entry.startswith("wiki-")
207                and entry.endswith(".txt")
208            ):
209                file_path = f"{self.folder}{entry}"
210                try:
211                    os.remove(file_path)
212                    deleted_articles += 1
213                except Exception as e:
214                    logger.warning(
215                        "Unable to delete article cache {}: {}", file_path, e
216                    )
217
218        self.articles = {}
219        self._titleCache = {}
220        self.metadata = {
221            "slugs": [],
222            "actualWords": 0,
223            "articleCount": 0,
224            "truncated": False,
225            "requestedArticles": 0,
226        }
227
228        result = {
229            "mode": mode,
230            "deletedFetchCaches": deleted_fetch,
231            "deletedArticleCaches": deleted_articles,
232        }
233        logger.info("Flushed cache: {}", result)
234        return result
235
236    def fetchArticle(self, slug: str) -> str:
237        """
238        Fetches a Wikipedia article by slug, saves it, and returns parsed content.
239
240        Args:
241            slug: The Wikipedia page slug.
242
243        Returns:
244            The parsed article content.
245        """
246
247        def _saveArticle(slug: str) -> str:
248            """
249            Saves the Wikipedia article as a .txt file and returns its content.
250
251            Args:
252                slug: The Wikipedia page slug.
253
254            Returns:
255                The saved article content.
256            """
257            logger.debug("Saving article for {}", slug)
258
259            max_retries = 4
260            for attempt in range(max_retries):
261                try:
262                    page = wikipedia.page(slug, auto_suggest=False)
263                    break
264                except (json.JSONDecodeError, ValueError) as e:
265                    if attempt < max_retries - 1:
266                        wait = 2**attempt
267                        logger.warning(
268                            "Empty/invalid response fetching {} (attempt {}/{}), retrying in {}s: {}",
269                            slug,
270                            attempt + 1,
271                            max_retries,
272                            wait,
273                            e,
274                        )
275                        time.sleep(wait)
276                    else:
277                        raise
278
279            with open(locateFile(slug), "w") as f:
280                pageClean = (
281                    KTextCleaner(page.content)
282                    .cleanWikipedia()
283                    .sanitizeForbidden()
284                    .removeLaTeX()
285                    .cleanPunctuation()
286                    .improvePunctuation()
287                    .cleanWhitespace()
288                    .get()
289                )
290                output = f"{page.url}\n\n{pageClean}"
291                f.write(output)
292                return output
293
294        def _parseArticle(input: str) -> str:
295            """
296            Parses the saved article content, removing URL and formatting.
297
298            Args:
299                input: The raw article content.
300
301            Returns:
302                The cleaned article content.
303            """
304            # Strip article URL from first line
305            output = re.sub(r"^http.+\n+", "", input)
306            # Strip == Headline == formatting
307            output = re.sub(r"(?<=\n)=+\s+|\s+=+(?=\n)", "", output)
308
309            return output
310
311        if slug in self._titleCache:
312            pageTitle = self._titleCache[slug]
313        else:
314            pageTitle = self._findWikiTitle(slug)
315            self._titleCache[slug] = pageTitle
316        locateFile = lambda pageSlug: (
317            f"{self.folder}wiki-{content.toKebabCase(pageSlug)}.txt"
318        )
319        filePath = locateFile(pageTitle)
320
321        if files.isFile(filePath):
322            result = files.readFile(filePath)
323        else:
324            try:
325                result = _saveArticle(pageTitle)
326            except Exception as e:
327                logger.warning("Unable to save {}: {}", pageTitle, e)
328                raise
329
330        return _parseArticle(result)
331
332    def fetchArticles(
333        self,
334        slugs: list[str],
335        targetWords: int = None,
336        maxWords: int = None,
337        minWordsPerArticle: int = None,
338    ) -> "KScraper":
339        """
340        Fetches and parses multiple Wikipedia articles with smart size management.
341        Stores articles in self.articles dict as {slug: content}.
342        Uses instance mode to automatically determine size parameters if not provided.
343
344        Args:
345            slugs: A list of Wikipedia page slugs.
346            targetWords: Target word count (distributes quota across articles). If None, uses mode-based estimate.
347            maxWords: Hard maximum word count (stops fetching when reached). If None, uses mode-based estimate.
348            minWordsPerArticle: Skip articles shorter than this. If None, uses mode-based estimate.
349
350        Returns self for chaining.
351        """
352        # Apply mode-based defaults if parameters not provided
353        if targetWords is None or maxWords is None or minWordsPerArticle is None:
354            params = self.estimateOptimalSize(len(slugs), self.mode)
355            targetWords = targetWords or params["targetWords"]
356            maxWords = maxWords or params["maxWords"]
357            minWordsPerArticle = (
358                minWordsPerArticle
359                if minWordsPerArticle is not None
360                else params["minWordsPerArticle"]
361            )
362
363        if not slugs:
364            self.articles = {}
365            self.metadata = {
366                "slugs": [],
367                "actualWords": 0,
368                "articleCount": 0,
369                "truncated": False,
370                "requestedArticles": 0,
371            }
372            return self
373
374        uniqueSlugs = list(dict.fromkeys(slugs))
375
376        cachePath = self._fetchCachePath(
377            slugs, targetWords, maxWords, minWordsPerArticle
378        )
379        if files.isFile(cachePath):
380            try:
381                payload = files.readFile(cachePath, mode="json")
382                cachedArticles = payload.get("articles", {})
383                self.articles = {
384                    slug: cachedArticles[slug]
385                    for slug in slugs
386                    if slug in cachedArticles
387                }
388                self.metadata = payload.get("metadata", {})
389                logger.trace("Loaded fetch cache: {}", cachePath)
390                return self
391            except Exception as e:
392                logger.warning("Invalid fetch cache {}: {}", cachePath, e)
393
394        fetched_slugs = []
395        total_words = 0
396        truncated = False
397        articles_fetched = 0
398
399        # Calculate per-article quota if target is specified
400        quota_per_article = targetWords // len(slugs) if targetWords and slugs else None
401
402        rawResults = {}
403        with ThreadPoolExecutor(max_workers=self.maxWorkers) as executor:
404            futures = {
405                executor.submit(self.fetchArticle, slug): slug for slug in uniqueSlugs
406            }
407            for future in as_completed(futures):
408                slug = futures[future]
409                try:
410                    rawResults[slug] = future.result()
411                except Exception as e:
412                    logger.warning(f"Failed to fetch {slug}: {e}")
413
414        for slug in slugs:
415            # Stop if we've hit the hard maximum
416            if maxWords and total_words >= maxWords:
417                logger.info(f"Reached maxWords limit ({maxWords}), stopping fetch")
418                truncated = True
419                break
420
421            article_text = rawResults.get(slug)
422            if not article_text:
423                continue
424
425            word_count = len(article_text.split())
426
427            # Skip articles that are too short
428            if word_count < minWordsPerArticle:
429                logger.debug(
430                    f"Skipping {slug} ({word_count} words < {minWordsPerArticle} minimum)"
431                )
432                continue
433
434            # Apply quota-based truncation if target is set
435            if quota_per_article and word_count > quota_per_article:
436                article_text = self._truncateAtSentence(article_text, quota_per_article)
437                word_count = len(article_text.split())
438                truncated = True
439                logger.trace(f"Truncated {slug} to ~{quota_per_article} words")
440
441            # Apply hard max truncation
442            remaining_quota = maxWords - total_words if maxWords else None
443            if remaining_quota and word_count > remaining_quota:
444                article_text = self._truncateAtSentence(article_text, remaining_quota)
445                word_count = len(article_text.split())
446                truncated = True
447                logger.trace(
448                    f"Truncated {slug} to fit remaining quota ({remaining_quota} words)"
449                )
450
451            # Store in articles dict
452            self.articles[slug] = article_text
453            fetched_slugs.append(slug)
454            total_words += word_count
455            articles_fetched += 1
456
457            logger.trace(f"Fetched {slug}: {word_count} words (total: {total_words})")
458
459        self.metadata = {
460            "slugs": fetched_slugs,
461            "actualWords": total_words,
462            "articleCount": articles_fetched,
463            "truncated": truncated,
464            "requestedArticles": len(slugs),
465        }
466        logger.trace(self.metadata)
467
468        try:
469            with open(cachePath, "w", encoding="utf-8") as f:
470                json.dump(
471                    {"articles": self.articles, "metadata": self.metadata},
472                    f,
473                    ensure_ascii=False,
474                )
475        except Exception as e:
476            logger.warning("Unable to write fetch cache {}: {}", cachePath, e)
477
478        return self
479
480    def _truncateAtSentence(self, text: str, target_words: int) -> str:
481        """
482        Truncate text at sentence boundary closest to target word count.
483
484        Args:
485            text: Text to truncate.
486            target_words: Target word count.
487
488        Returns:
489            Truncated text ending at a sentence boundary.
490        """
491        sentences = self._splitSentences(text)
492        result = []
493        word_count = 0
494
495        for sentence in sentences:
496            sentence_words = len(sentence.split())
497            if word_count + sentence_words > target_words:
498                break
499            result.append(sentence)
500            word_count += sentence_words
501
502        return " ".join(result)
503
504    def compileArticles(self, slugs: list[str] = None, separator: str = "\n\n") -> str:
505        """
506        Compile fetched articles into a single concatenated string.
507
508        Args:
509            slugs: List of specific slugs to compile. If None, compiles all stored articles.
510            separator: String to join articles with (default: "\\n\\n").
511
512        Returns:
513            Concatenated article content as a single string.
514        """
515        if slugs is None:
516            # Compile all articles
517            articles_to_compile = list(self.articles.values())
518        else:
519            # Compile only specified slugs
520            articles_to_compile = [
521                self.articles[slug] for slug in slugs if slug in self.articles
522            ]
523
524        if not articles_to_compile:
525            raise ValueError(
526                "No articles to compile. Check if slugs are correct and articles are  fetched."
527            )
528        return separator.join(articles_to_compile)
KScraper( folder='01 Content/articles', language='en', mode: Literal['quick', 'balanced', 'comprehensive'] = 'quick', maxWorkers: int = 4)
112    def __init__(
113        self,
114        folder="01 Content/articles",
115        language="en",
116        mode: ScraperMode = "quick",
117        maxWorkers: int = 4,
118    ) -> None:
119        """
120        Initializes the KScraper with a target folder, language, and processing mode.
121
122        Args:
123            folder: Folder partial where articles will be saved.
124            language: The language for Wikipedia articles.
125            mode: Processing mode
126                - `quick` (10-15k words)
127                - `balanced` (20-30k words)
128                - `comprehensive` (40-50k words)
129            maxWorkers: Number of concurrent fetch workers.
130        """
131        # Prepend absolute path in case specimen not run from 03 DrawBot/
132        self.folder = files.createFolder(
133            f"/Users/christianjansky/Library/CloudStorage/Dropbox/KOMETA-Draw/{folder.rstrip('/')}-{language}/"
134        )
135        """Full path to the file system folder where articles will be saved."""
136        self.mode = mode
137        """Processing mode for size estimation."""
138        self.language = language
139        """Language code used for Wikipedia queries and cache keys."""
140        self.maxWorkers = max(1, int(maxWorkers))
141        """Maximum number of concurrent workers used by fetchArticles."""
142        self.articles = {}
143        """Dict storing fetched articles as {slug: content}."""
144        self._titleCache = {}
145        """Per-instance cache for resolved page titles by requested slug."""
146        wikipedia.set_lang(language)

Initializes the KScraper with a target folder, language, and processing mode.

Arguments:
  • folder: Folder partial where articles will be saved.
  • language: The language for Wikipedia articles.
  • mode: Processing mode
    • quick (10-15k words)
    • balanced (20-30k words)
    • comprehensive (40-50k words)
  • maxWorkers: Number of concurrent fetch workers.
ScraperMode = typing.Literal['quick', 'balanced', 'comprehensive']
metadata: dict

Metadata about the last fetchArticles call.

@staticmethod
def estimateOptimalSize( article_count: int, mode: Literal['quick', 'balanced', 'comprehensive'] = 'quick') -> dict:
23    @staticmethod
24    def estimateOptimalSize(article_count: int, mode: ScraperMode = "quick") -> dict:
25        """
26        Recommends optimal text size parameters based on article count and use case.
27
28        Args:
29            article_count: Number of articles to fetch.
30            mode: Processing mode indicating desired size and depth of content.
31
32        Returns:
33            Dict with recommended 'targetWords', 'maxWords', and 'minWordsPerArticle'.
34        """
35        modes = {
36            "quick": {"base": 10000, "max": 15000, "min_per": 250},
37            "balanced": {"base": 20000, "max": 30000, "min_per": 300},
38            "comprehensive": {"base": 40000, "max": 50000, "min_per": 500},
39        }
40
41        config = modes.get(mode, modes["quick"])
42
43        # Adjust target based on article count
44        # More articles = higher target (but capped at max)
45        target = min(config["base"] * (1 + article_count // 5), config["max"])
46
47        return {
48            "targetWords": target,
49            "maxWords": config["max"],
50            "minWordsPerArticle": config["min_per"],
51        }

Recommends optimal text size parameters based on article count and use case.

Arguments:
  • article_count: Number of articles to fetch.
  • mode: Processing mode indicating desired size and depth of content.
Returns:

Dict with recommended 'targetWords', 'maxWords', and 'minWordsPerArticle'.

folder

Full path to the file system folder where articles will be saved.

mode

Processing mode for size estimation.

language

Language code used for Wikipedia queries and cache keys.

maxWorkers

Maximum number of concurrent workers used by fetchArticles.

articles

Dict storing fetched articles as {slug: content}.

def flushCache( self, mode: Literal['fetch', 'articles', 'all'] = 'all') -> dict[str, int | str]:
169    def flushCache(self, mode: CacheFlushMode = "all") -> dict[str, int | str]:
170        """
171        Flush scraper cache files from disk and clear in-memory cache state.
172
173        Args:
174            mode: Which cache files to remove:
175                - "fetch": Remove `.fetch-cache-*.json` files.
176                - "articles": Remove `wiki-*.txt` files.
177                - "all": Remove both cache types.
178
179        Returns:
180            Dict with deletion counters and selected mode.
181        """
182        deleted_fetch = 0
183        deleted_articles = 0
184
185        try:
186            folder_entries = os.listdir(self.folder)
187        except Exception as e:
188            logger.warning("Unable to list cache folder {}: {}", self.folder, e)
189            folder_entries = []
190
191        for entry in folder_entries:
192            if (
193                mode in ("fetch", "all")
194                and entry.startswith(".fetch-cache-")
195                and entry.endswith(".json")
196            ):
197                file_path = f"{self.folder}{entry}"
198                try:
199                    os.remove(file_path)
200                    deleted_fetch += 1
201                except Exception as e:
202                    logger.warning("Unable to delete fetch cache {}: {}", file_path, e)
203
204            if (
205                mode in ("articles", "all")
206                and entry.startswith("wiki-")
207                and entry.endswith(".txt")
208            ):
209                file_path = f"{self.folder}{entry}"
210                try:
211                    os.remove(file_path)
212                    deleted_articles += 1
213                except Exception as e:
214                    logger.warning(
215                        "Unable to delete article cache {}: {}", file_path, e
216                    )
217
218        self.articles = {}
219        self._titleCache = {}
220        self.metadata = {
221            "slugs": [],
222            "actualWords": 0,
223            "articleCount": 0,
224            "truncated": False,
225            "requestedArticles": 0,
226        }
227
228        result = {
229            "mode": mode,
230            "deletedFetchCaches": deleted_fetch,
231            "deletedArticleCaches": deleted_articles,
232        }
233        logger.info("Flushed cache: {}", result)
234        return result

Flush scraper cache files from disk and clear in-memory cache state.

Arguments:
  • mode: Which cache files to remove:
    • "fetch": Remove .fetch-cache-*.json files.
    • "articles": Remove wiki-*.txt files.
    • "all": Remove both cache types.
Returns:

Dict with deletion counters and selected mode.

def fetchArticle(self, slug: str) -> str:
236    def fetchArticle(self, slug: str) -> str:
237        """
238        Fetches a Wikipedia article by slug, saves it, and returns parsed content.
239
240        Args:
241            slug: The Wikipedia page slug.
242
243        Returns:
244            The parsed article content.
245        """
246
247        def _saveArticle(slug: str) -> str:
248            """
249            Saves the Wikipedia article as a .txt file and returns its content.
250
251            Args:
252                slug: The Wikipedia page slug.
253
254            Returns:
255                The saved article content.
256            """
257            logger.debug("Saving article for {}", slug)
258
259            max_retries = 4
260            for attempt in range(max_retries):
261                try:
262                    page = wikipedia.page(slug, auto_suggest=False)
263                    break
264                except (json.JSONDecodeError, ValueError) as e:
265                    if attempt < max_retries - 1:
266                        wait = 2**attempt
267                        logger.warning(
268                            "Empty/invalid response fetching {} (attempt {}/{}), retrying in {}s: {}",
269                            slug,
270                            attempt + 1,
271                            max_retries,
272                            wait,
273                            e,
274                        )
275                        time.sleep(wait)
276                    else:
277                        raise
278
279            with open(locateFile(slug), "w") as f:
280                pageClean = (
281                    KTextCleaner(page.content)
282                    .cleanWikipedia()
283                    .sanitizeForbidden()
284                    .removeLaTeX()
285                    .cleanPunctuation()
286                    .improvePunctuation()
287                    .cleanWhitespace()
288                    .get()
289                )
290                output = f"{page.url}\n\n{pageClean}"
291                f.write(output)
292                return output
293
294        def _parseArticle(input: str) -> str:
295            """
296            Parses the saved article content, removing URL and formatting.
297
298            Args:
299                input: The raw article content.
300
301            Returns:
302                The cleaned article content.
303            """
304            # Strip article URL from first line
305            output = re.sub(r"^http.+\n+", "", input)
306            # Strip == Headline == formatting
307            output = re.sub(r"(?<=\n)=+\s+|\s+=+(?=\n)", "", output)
308
309            return output
310
311        if slug in self._titleCache:
312            pageTitle = self._titleCache[slug]
313        else:
314            pageTitle = self._findWikiTitle(slug)
315            self._titleCache[slug] = pageTitle
316        locateFile = lambda pageSlug: (
317            f"{self.folder}wiki-{content.toKebabCase(pageSlug)}.txt"
318        )
319        filePath = locateFile(pageTitle)
320
321        if files.isFile(filePath):
322            result = files.readFile(filePath)
323        else:
324            try:
325                result = _saveArticle(pageTitle)
326            except Exception as e:
327                logger.warning("Unable to save {}: {}", pageTitle, e)
328                raise
329
330        return _parseArticle(result)

Fetches a Wikipedia article by slug, saves it, and returns parsed content.

Arguments:
  • slug: The Wikipedia page slug.
Returns:

The parsed article content.

def fetchArticles( self, slugs: list[str], targetWords: int = None, maxWords: int = None, minWordsPerArticle: int = None) -> KScraper:
332    def fetchArticles(
333        self,
334        slugs: list[str],
335        targetWords: int = None,
336        maxWords: int = None,
337        minWordsPerArticle: int = None,
338    ) -> "KScraper":
339        """
340        Fetches and parses multiple Wikipedia articles with smart size management.
341        Stores articles in self.articles dict as {slug: content}.
342        Uses instance mode to automatically determine size parameters if not provided.
343
344        Args:
345            slugs: A list of Wikipedia page slugs.
346            targetWords: Target word count (distributes quota across articles). If None, uses mode-based estimate.
347            maxWords: Hard maximum word count (stops fetching when reached). If None, uses mode-based estimate.
348            minWordsPerArticle: Skip articles shorter than this. If None, uses mode-based estimate.
349
350        Returns self for chaining.
351        """
352        # Apply mode-based defaults if parameters not provided
353        if targetWords is None or maxWords is None or minWordsPerArticle is None:
354            params = self.estimateOptimalSize(len(slugs), self.mode)
355            targetWords = targetWords or params["targetWords"]
356            maxWords = maxWords or params["maxWords"]
357            minWordsPerArticle = (
358                minWordsPerArticle
359                if minWordsPerArticle is not None
360                else params["minWordsPerArticle"]
361            )
362
363        if not slugs:
364            self.articles = {}
365            self.metadata = {
366                "slugs": [],
367                "actualWords": 0,
368                "articleCount": 0,
369                "truncated": False,
370                "requestedArticles": 0,
371            }
372            return self
373
374        uniqueSlugs = list(dict.fromkeys(slugs))
375
376        cachePath = self._fetchCachePath(
377            slugs, targetWords, maxWords, minWordsPerArticle
378        )
379        if files.isFile(cachePath):
380            try:
381                payload = files.readFile(cachePath, mode="json")
382                cachedArticles = payload.get("articles", {})
383                self.articles = {
384                    slug: cachedArticles[slug]
385                    for slug in slugs
386                    if slug in cachedArticles
387                }
388                self.metadata = payload.get("metadata", {})
389                logger.trace("Loaded fetch cache: {}", cachePath)
390                return self
391            except Exception as e:
392                logger.warning("Invalid fetch cache {}: {}", cachePath, e)
393
394        fetched_slugs = []
395        total_words = 0
396        truncated = False
397        articles_fetched = 0
398
399        # Calculate per-article quota if target is specified
400        quota_per_article = targetWords // len(slugs) if targetWords and slugs else None
401
402        rawResults = {}
403        with ThreadPoolExecutor(max_workers=self.maxWorkers) as executor:
404            futures = {
405                executor.submit(self.fetchArticle, slug): slug for slug in uniqueSlugs
406            }
407            for future in as_completed(futures):
408                slug = futures[future]
409                try:
410                    rawResults[slug] = future.result()
411                except Exception as e:
412                    logger.warning(f"Failed to fetch {slug}: {e}")
413
414        for slug in slugs:
415            # Stop if we've hit the hard maximum
416            if maxWords and total_words >= maxWords:
417                logger.info(f"Reached maxWords limit ({maxWords}), stopping fetch")
418                truncated = True
419                break
420
421            article_text = rawResults.get(slug)
422            if not article_text:
423                continue
424
425            word_count = len(article_text.split())
426
427            # Skip articles that are too short
428            if word_count < minWordsPerArticle:
429                logger.debug(
430                    f"Skipping {slug} ({word_count} words < {minWordsPerArticle} minimum)"
431                )
432                continue
433
434            # Apply quota-based truncation if target is set
435            if quota_per_article and word_count > quota_per_article:
436                article_text = self._truncateAtSentence(article_text, quota_per_article)
437                word_count = len(article_text.split())
438                truncated = True
439                logger.trace(f"Truncated {slug} to ~{quota_per_article} words")
440
441            # Apply hard max truncation
442            remaining_quota = maxWords - total_words if maxWords else None
443            if remaining_quota and word_count > remaining_quota:
444                article_text = self._truncateAtSentence(article_text, remaining_quota)
445                word_count = len(article_text.split())
446                truncated = True
447                logger.trace(
448                    f"Truncated {slug} to fit remaining quota ({remaining_quota} words)"
449                )
450
451            # Store in articles dict
452            self.articles[slug] = article_text
453            fetched_slugs.append(slug)
454            total_words += word_count
455            articles_fetched += 1
456
457            logger.trace(f"Fetched {slug}: {word_count} words (total: {total_words})")
458
459        self.metadata = {
460            "slugs": fetched_slugs,
461            "actualWords": total_words,
462            "articleCount": articles_fetched,
463            "truncated": truncated,
464            "requestedArticles": len(slugs),
465        }
466        logger.trace(self.metadata)
467
468        try:
469            with open(cachePath, "w", encoding="utf-8") as f:
470                json.dump(
471                    {"articles": self.articles, "metadata": self.metadata},
472                    f,
473                    ensure_ascii=False,
474                )
475        except Exception as e:
476            logger.warning("Unable to write fetch cache {}: {}", cachePath, e)
477
478        return self

Fetches and parses multiple Wikipedia articles with smart size management. Stores articles in self.articles dict as {slug: content}. Uses instance mode to automatically determine size parameters if not provided.

Arguments:
  • slugs: A list of Wikipedia page slugs.
  • targetWords: Target word count (distributes quota across articles). If None, uses mode-based estimate.
  • maxWords: Hard maximum word count (stops fetching when reached). If None, uses mode-based estimate.
  • minWordsPerArticle: Skip articles shorter than this. If None, uses mode-based estimate.

Returns self for chaining.

def compileArticles(self, slugs: list[str] = None, separator: str = '\n\n') -> str:
504    def compileArticles(self, slugs: list[str] = None, separator: str = "\n\n") -> str:
505        """
506        Compile fetched articles into a single concatenated string.
507
508        Args:
509            slugs: List of specific slugs to compile. If None, compiles all stored articles.
510            separator: String to join articles with (default: "\\n\\n").
511
512        Returns:
513            Concatenated article content as a single string.
514        """
515        if slugs is None:
516            # Compile all articles
517            articles_to_compile = list(self.articles.values())
518        else:
519            # Compile only specified slugs
520            articles_to_compile = [
521                self.articles[slug] for slug in slugs if slug in self.articles
522            ]
523
524        if not articles_to_compile:
525            raise ValueError(
526                "No articles to compile. Check if slugs are correct and articles are  fetched."
527            )
528        return separator.join(articles_to_compile)

Compile fetched articles into a single concatenated string.

Arguments:
  • slugs: List of specific slugs to compile. If None, compiles all stored articles.
  • separator: String to join articles with (default: "\n\n").
Returns:

Concatenated article content as a single string.