classes.c30_scraper

import wikipedia
import re

from loguru import logger

from lib import helpers, content


class KScraper:
    def __init__(self, folder="01 Content/articles", language="en") -> None:
        """
        Initializes the KScraper with a target folder and language.

        Args:
            folder: Partial folder path where articles will be saved.
            language: The language for Wikipedia articles.
        """
        # Prepend absolute path in case specimen not run from 03 DrawBot/
        self.folder = helpers.createFolder(
            f"/Users/christianjansky/Library/CloudStorage/Dropbox/KOMETA-Work/40 Scripts/03 DrawBot/{folder.rstrip('/')}-{language}/"
        )
        """Full path to the file system folder where articles will be saved."""
        wikipedia.set_lang(language)

    def fetchArticle(self, slug: str) -> str:
        """
        Fetches a Wikipedia article by slug, saves it, and returns parsed content.

        Args:
            slug: The Wikipedia page slug.

        Returns:
            The parsed article content.
        """

        def _findWikiTitle(slug: str) -> str:
            """
            Finds a valid Wikipedia page title or its closest match.

            Args:
                slug: The Wikipedia page slug.

            Returns:
                The resolved Wikipedia page title.
            """
            try:
                try:
                    # Find exact match
                    page = wikipedia.page(slug, auto_suggest=False)
                except wikipedia.exceptions.PageError as e:
                    # Correct spelling error
                    suggested = wikipedia.suggest(e.pageid)
                    logger.warning("Correcting {} to {}", e.pageid, suggested)
                    page = wikipedia.page(suggested, auto_suggest=False)
            except wikipedia.exceptions.DisambiguationError as e:
                # Pick first possible match if it may refer to multiple pages
                logger.warning("Possible pages for {}: {}", slug, e.options)
                page = wikipedia.page(helpers.pickFirst(e.options), auto_suggest=False)

            return page.title

        def _fixPageContent(input: str) -> str:
            """
            Fixes collapsed spacing in Wikipedia content.

            Args:
                input: The raw Wikipedia content.

            Returns:
                The fixed content with proper spacing.
            """
            output = re.sub(r"([a-z0-9]\.)([A-Z])", r"\1 \2", input)
            return output

        def _saveArticle(slug: str) -> str:
            """
            Saves the Wikipedia article as a .txt file and returns its content.

            Args:
                slug: The Wikipedia page slug.

            Returns:
                The saved article content.
            """
            logger.debug("Saving article for {}", slug)
            with open(locateFile(slug), "w") as file:
                page = wikipedia.page(slug, auto_suggest=False)
                output = f"{page.url}\n\n{_fixPageContent(page.content)}"
                file.write(output)
            return output

        def _parseArticle(input: str) -> str:
            """
            Parses the saved article content, removing URL and formatting.

            Args:
                input: The raw article content.

            Returns:
                The cleaned article content.
            """
            # Strip article URL from first line
            output = re.sub(r"^http.+\n+", "", input)
            # Strip == Headline == formatting
            output = re.sub(r"(?<=\n)=+\s+|\s+=+(?=\n)", "", output)

            return output

        pageTitle = _findWikiTitle(slug)
        locateFile = (
            lambda pageSlug: f"{self.folder}wiki-{content.toKebabCase(pageSlug)}.txt"
        )
        filePath = locateFile(pageTitle)

        if helpers.isFile(filePath):
            result = helpers.readFile(filePath)
        else:
            try:
                result = _saveArticle(pageTitle)
            except Exception as e:
                logger.warning("Unable to save {}: {}", pageTitle, e)
                # Fall back to empty content so the parsing step below
                # does not fail on an undefined name
                result = ""

        return _parseArticle(result)

    def fetchArticles(self, slugs: list[str]) -> str:
        """
        Fetches and parses multiple Wikipedia articles.

        Args:
            slugs: A list of Wikipedia page slugs.

        Returns:
            The concatenated content of all fetched articles.
        """
        return "\n".join(map(self.fetchArticle, slugs))
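
For reference, a small self-contained sketch of the two regex cleanups used above (_fixPageContent and _parseArticle); the sample strings are illustrative, not real article content:

import re

# Collapsed spacing as it sometimes appears in Wikipedia extracts:
raw = "The comet was observed in 1986.It returns roughly every 76 years."

# Same substitution as _fixPageContent: insert a space after the period.
print(re.sub(r"([a-z0-9]\.)([A-Z])", r"\1 \2", raw))
# -> "The comet was observed in 1986. It returns roughly every 76 years."

# Saved-article shape: URL on the first line, then == Headline == markers.
saved = "https://en.wikipedia.org/wiki/Example\n\nIntro text.\n== History ==\nMore text.\n"

# Same substitutions as _parseArticle: drop the URL line, then the == markers.
parsed = re.sub(r"^http.+\n+", "", saved)
parsed = re.sub(r"(?<=\n)=+\s+|\s+=+(?=\n)", "", parsed)
print(parsed)
# -> "Intro text.\nHistory\nMore text.\n"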
class KScraper:
KScraper(folder='01 Content/articles', language='en')
Initializes the KScraper with a target folder and language.
Arguments:
- folder: Partial folder path where articles will be saved.
- language: The language for Wikipedia articles.
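
A minimal usage sketch, assuming the module is importable as classes.c30_scraper (as named in this documentation) and that lib.helpers and lib.content are available; the folder and language values are illustrative:

from classes.c30_scraper import KScraper

# Illustrative values: with language="de" the articles land in
# ".../01 Content/articles-de/" and are fetched from the German Wikipedia.
scraper = KScraper(folder="01 Content/articles", language="de")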
def fetchArticle(self, slug: str) -> str:
Fetches a Wikipedia article by slug, saves it, and returns parsed content.
Arguments:
- slug: The Wikipedia page slug.
Returns:
The parsed article content.
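
A short sketch of a single fetch, assuming the same import path as above; the slug is illustrative:

from classes.c30_scraper import KScraper

scraper = KScraper()

# The first call resolves the title, downloads the article, caches it as a
# "wiki-<kebab-case-title>.txt" file in the target folder, and returns the
# cleaned text; subsequent calls read the cached file instead.
text = scraper.fetchArticle("Halley's Comet")
print(text[:200])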
def fetchArticles(self, slugs: list[str]) -> str:
Fetches and parses multiple Wikipedia articles.
Arguments:
- slugs: A list of Wikipedia page slugs.
Returns:
The concatenated content of all fetched articles.
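
And a batch sketch, again with illustrative slugs; each article is fetched (or read from its cached file) via fetchArticle and the cleaned texts are joined with a newline:

from classes.c30_scraper import KScraper

scraper = KScraper()

# Returns one string containing both cleaned articles, separated by "\n".
combined = scraper.fetchArticles(["Halley's Comet", "Comet Hale-Bopp"])
print(len(combined))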