classes.c30_scraper

View Source

  1import wikipedia
  2import re
  3from loguru import logger
  4
  5from lib import helpers, content
  6
  7
  8class KScraper:
  9    def __init__(self, folder="01 Content/articles", language="en") -> None:
 10        """
 11        Initializes the KScraper with a target folder and language.
 12
 13        Args:
 14            folder: Folder partial where articles will be saved.
 15            language: The language for Wikipedia articles.
 16        """
 17        # Prepend absolute path in case specimen not run from 03 DrawBot/
 18        self.folder = helpers.createFolder(
 19            f"/Users/christianjansky/Library/CloudStorage/Dropbox/KOMETA-Work/40 Scripts/03 DrawBot/{folder.rstrip('/')}-{language}/"
 20        )
 21        """Full path to the file system folder where articles will be saved."""
 22        wikipedia.set_lang(language)
 23
 24    def fetchArticle(self, slug: str) -> str:
 25        """
 26        Fetches a Wikipedia article by slug, saves it, and returns parsed content.
 27
 28        Args:
 29            slug: The Wikipedia page slug.
 30
 31        Returns:
 32            The parsed article content.
 33        """
 34
 35        def _findWikiTitle(slug: str) -> str:
 36            """
 37            Finds a valid Wikipedia page title or its closest match.
 38
 39            Args:
 40                slug: The Wikipedia page slug.
 41
 42            Returns:
 43                The resolved Wikipedia page title.
 44            """
 45            try:
 46                try:
 47                    # Find exact match
 48                    page = wikipedia.page(slug, auto_suggest=False)
 49                except wikipedia.exceptions.PageError as e:
 50                    # Correct spelling error
 51                    suggested = wikipedia.suggest(e.pageid)
 52                    logger.warning("Correcting {} to {}", e.pageid, suggested)
 53                    page = wikipedia.page(suggested, auto_suggest=False)
 54            except wikipedia.exceptions.DisambiguationError as e:
 55                # Pick first possible match if it may refer to multiple pages
 56                logger.warning("Possible pages for {}: {}", slug, e.options)
 57                page = wikipedia.page(helpers.pickFirst(e.options), auto_suggest=False)
 58
 59            return page.title
 60
 61        def _fixPageContent(input: str) -> str:
 62            """
 63            Fixes collapsed spacing in Wikipedia content.
 64
 65            Args:
 66                input: The raw Wikipedia content.
 67
 68            Returns:
 69                The fixed content with proper spacing.
 70            """
 71            output = re.sub(r"([a-z0-9]\.)([A-Z])", r"\1 \2", input)
 72            return output
 73
 74        def _saveArticle(slug: str) -> str:
 75            """
 76            Saves the Wikipedia article as a .txt file and returns its content.
 77
 78            Args:
 79                slug: The Wikipedia page slug.
 80
 81            Returns:
 82                The saved article content.
 83            """
 84            logger.debug("Saving article for {}", slug)
 85            with open(locateFile(slug), "w") as file:
 86                page = wikipedia.page(slug, auto_suggest=False)
 87                output = f"{page.url}\n\n{_fixPageContent(page.content)}"
 88                file.write(output)
 89                return output
 90
 91        def _parseArticle(input: str) -> str:
 92            """
 93            Parses the saved article content, removing URL and formatting.
 94
 95            Args:
 96                input: The raw article content.
 97
 98            Returns:
 99                The cleaned article content.
100            """
101            # Strip article URL from first line
102            output = re.sub(r"^http.+\n+", "", input)
103            # Strip == Headline == formatting
104            output = re.sub(r"(?<=\n)=+\s+|\s+=+(?=\n)", "", output)
105
106            return output
107
108        pageTitle = _findWikiTitle(slug)
109        locateFile = (
110            lambda pageSlug: f"{self.folder}wiki-{content.toKebabCase(pageSlug)}.txt"
111        )
112        filePath = locateFile(pageTitle)
113
114        if helpers.isFile(filePath):
115            result = helpers.readFile(filePath)
116        else:
117            try:
118                result = _saveArticle(pageTitle)
119            except Exception as e:
120                logger.warning("Unable to save {}: {}", pageTitle, e)
121
122        return _parseArticle(result)
123
124    def fetchArticles(self, slugs: list[str]) -> str:
125        """
126        Fetches and parses multiple Wikipedia articles.
127
128        Args:
129            slugs: A list of Wikipedia page slugs.
130
131        Returns:
132            The concatenated content of all fetched articles.
133        """
134        return "\n".join(map(self.fetchArticle, slugs))

class KScraper: View Source

  9class KScraper:
 10    def __init__(self, folder="01 Content/articles", language="en") -> None:
 11        """
 12        Initializes the KScraper with a target folder and language.
 13
 14        Args:
 15            folder: Folder partial where articles will be saved.
 16            language: The language for Wikipedia articles.
 17        """
 18        # Prepend absolute path in case specimen not run from 03 DrawBot/
 19        self.folder = helpers.createFolder(
 20            f"/Users/christianjansky/Library/CloudStorage/Dropbox/KOMETA-Work/40 Scripts/03 DrawBot/{folder.rstrip('/')}-{language}/"
 21        )
 22        """Full path to the file system folder where articles will be saved."""
 23        wikipedia.set_lang(language)
 24
 25    def fetchArticle(self, slug: str) -> str:
 26        """
 27        Fetches a Wikipedia article by slug, saves it, and returns parsed content.
 28
 29        Args:
 30            slug: The Wikipedia page slug.
 31
 32        Returns:
 33            The parsed article content.
 34        """
 35
 36        def _findWikiTitle(slug: str) -> str:
 37            """
 38            Finds a valid Wikipedia page title or its closest match.
 39
 40            Args:
 41                slug: The Wikipedia page slug.
 42
 43            Returns:
 44                The resolved Wikipedia page title.
 45            """
 46            try:
 47                try:
 48                    # Find exact match
 49                    page = wikipedia.page(slug, auto_suggest=False)
 50                except wikipedia.exceptions.PageError as e:
 51                    # Correct spelling error
 52                    suggested = wikipedia.suggest(e.pageid)
 53                    logger.warning("Correcting {} to {}", e.pageid, suggested)
 54                    page = wikipedia.page(suggested, auto_suggest=False)
 55            except wikipedia.exceptions.DisambiguationError as e:
 56                # Pick first possible match if it may refer to multiple pages
 57                logger.warning("Possible pages for {}: {}", slug, e.options)
 58                page = wikipedia.page(helpers.pickFirst(e.options), auto_suggest=False)
 59
 60            return page.title
 61
 62        def _fixPageContent(input: str) -> str:
 63            """
 64            Fixes collapsed spacing in Wikipedia content.
 65
 66            Args:
 67                input: The raw Wikipedia content.
 68
 69            Returns:
 70                The fixed content with proper spacing.
 71            """
 72            output = re.sub(r"([a-z0-9]\.)([A-Z])", r"\1 \2", input)
 73            return output
 74
 75        def _saveArticle(slug: str) -> str:
 76            """
 77            Saves the Wikipedia article as a .txt file and returns its content.
 78
 79            Args:
 80                slug: The Wikipedia page slug.
 81
 82            Returns:
 83                The saved article content.
 84            """
 85            logger.debug("Saving article for {}", slug)
 86            with open(locateFile(slug), "w") as file:
 87                page = wikipedia.page(slug, auto_suggest=False)
 88                output = f"{page.url}\n\n{_fixPageContent(page.content)}"
 89                file.write(output)
 90                return output
 91
 92        def _parseArticle(input: str) -> str:
 93            """
 94            Parses the saved article content, removing URL and formatting.
 95
 96            Args:
 97                input: The raw article content.
 98
 99            Returns:
100                The cleaned article content.
101            """
102            # Strip article URL from first line
103            output = re.sub(r"^http.+\n+", "", input)
104            # Strip == Headline == formatting
105            output = re.sub(r"(?<=\n)=+\s+|\s+=+(?=\n)", "", output)
106
107            return output
108
109        pageTitle = _findWikiTitle(slug)
110        locateFile = (
111            lambda pageSlug: f"{self.folder}wiki-{content.toKebabCase(pageSlug)}.txt"
112        )
113        filePath = locateFile(pageTitle)
114
115        if helpers.isFile(filePath):
116            result = helpers.readFile(filePath)
117        else:
118            try:
119                result = _saveArticle(pageTitle)
120            except Exception as e:
121                logger.warning("Unable to save {}: {}", pageTitle, e)
122
123        return _parseArticle(result)
124
125    def fetchArticles(self, slugs: list[str]) -> str:
126        """
127        Fetches and parses multiple Wikipedia articles.
128
129        Args:
130            slugs: A list of Wikipedia page slugs.
131
132        Returns:
133            The concatenated content of all fetched articles.
134        """
135        return "\n".join(map(self.fetchArticle, slugs))

KScraper(folder='01 Content/articles', language='en') View Source

10    def __init__(self, folder="01 Content/articles", language="en") -> None:
11        """
12        Initializes the KScraper with a target folder and language.
13
14        Args:
15            folder: Folder partial where articles will be saved.
16            language: The language for Wikipedia articles.
17        """
18        # Prepend absolute path in case specimen not run from 03 DrawBot/
19        self.folder = helpers.createFolder(
20            f"/Users/christianjansky/Library/CloudStorage/Dropbox/KOMETA-Work/40 Scripts/03 DrawBot/{folder.rstrip('/')}-{language}/"
21        )
22        """Full path to the file system folder where articles will be saved."""
23        wikipedia.set_lang(language)

Initializes the KScraper with a target folder and language.

Arguments:

folder: Partial folder path where articles will be saved; the language code is appended to it.
language: The language for Wikipedia articles.

folder

Full path to the file system folder where articles will be saved.

def fetchArticle(self, slug: str) -> str: View Source

 25    def fetchArticle(self, slug: str) -> str:
 26        """
 27        Fetches a Wikipedia article by slug, saves it, and returns parsed content.
 28
 29        Args:
 30            slug: The Wikipedia page slug.
 31
 32        Returns:
 33            The parsed article content.
 34        """
 35
 36        def _findWikiTitle(slug: str) -> str:
 37            """
 38            Finds a valid Wikipedia page title or its closest match.
 39
 40            Args:
 41                slug: The Wikipedia page slug.
 42
 43            Returns:
 44                The resolved Wikipedia page title.
 45            """
 46            try:
 47                try:
 48                    # Find exact match
 49                    page = wikipedia.page(slug, auto_suggest=False)
 50                except wikipedia.exceptions.PageError as e:
 51                    # Correct spelling error
 52                    suggested = wikipedia.suggest(e.pageid)
 53                    logger.warning("Correcting {} to {}", e.pageid, suggested)
 54                    page = wikipedia.page(suggested, auto_suggest=False)
 55            except wikipedia.exceptions.DisambiguationError as e:
 56                # Pick first possible match if it may refer to multiple pages
 57                logger.warning("Possible pages for {}: {}", slug, e.options)
 58                page = wikipedia.page(helpers.pickFirst(e.options), auto_suggest=False)
 59
 60            return page.title
 61
 62        def _fixPageContent(input: str) -> str:
 63            """
 64            Fixes collapsed spacing in Wikipedia content.
 65
 66            Args:
 67                input: The raw Wikipedia content.
 68
 69            Returns:
 70                The fixed content with proper spacing.
 71            """
 72            output = re.sub(r"([a-z0-9]\.)([A-Z])", r"\1 \2", input)
 73            return output
 74
 75        def _saveArticle(slug: str) -> str:
 76            """
 77            Saves the Wikipedia article as a .txt file and returns its content.
 78
 79            Args:
 80                slug: The Wikipedia page slug.
 81
 82            Returns:
 83                The saved article content.
 84            """
 85            logger.debug("Saving article for {}", slug)
 86            with open(locateFile(slug), "w") as file:
 87                page = wikipedia.page(slug, auto_suggest=False)
 88                output = f"{page.url}\n\n{_fixPageContent(page.content)}"
 89                file.write(output)
 90                return output
 91
 92        def _parseArticle(input: str) -> str:
 93            """
 94            Parses the saved article content, removing URL and formatting.
 95
 96            Args:
 97                input: The raw article content.
 98
 99            Returns:
100                The cleaned article content.
101            """
102            # Strip article URL from first line
103            output = re.sub(r"^http.+\n+", "", input)
104            # Strip == Headline == formatting
105            output = re.sub(r"(?<=\n)=+\s+|\s+=+(?=\n)", "", output)
106
107            return output
108
109        pageTitle = _findWikiTitle(slug)
110        locateFile = (
111            lambda pageSlug: f"{self.folder}wiki-{content.toKebabCase(pageSlug)}.txt"
112        )
113        filePath = locateFile(pageTitle)
114
115        if helpers.isFile(filePath):
116            result = helpers.readFile(filePath)
117        else:
118            try:
119                result = _saveArticle(pageTitle)
120            except Exception as e:
121                logger.warning("Unable to save {}: {}", pageTitle, e)
122
123        return _parseArticle(result)

Fetches a Wikipedia article by slug, saves it to disk unless a saved copy already exists, and returns the parsed content.

Arguments:

slug: The Wikipedia page slug.

Returns:

The parsed article content.

def fetchArticles(self, slugs: list[str]) -> str: View Source

125    def fetchArticles(self, slugs: list[str]) -> str:
126        """
127        Fetches and parses multiple Wikipedia articles.
128
129        Args:
130            slugs: A list of Wikipedia page slugs.
131
132        Returns:
133            The concatenated content of all fetched articles.
134        """
135        return "\n".join(map(self.fetchArticle, slugs))

Fetches and parses multiple Wikipedia articles.

Arguments:

slugs: A list of Wikipedia page slugs.

Returns:

The concatenated content of all fetched articles.