Skip to content

Commit

Permalink
Refactor Reddit scrapers into a more reasonable code structure
Browse files — browse the repository at this point in the history
Cf. #328
  • Loading branch information
JustAnotherArchivist committed Dec 24, 2021
1 parent 4dd3ee6 commit eee06d8
Showing 1 changed file with 31 additions and 23 deletions.
54 changes: 31 additions & 23 deletions snscrape/modules/reddit.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -202,33 +202,41 @@ def _setup_parser_opts(cls, subparser):
subparser.add_argument('--after', metavar = 'TIMESTAMP', type = int, help = 'Fetch results after a Unix timestamp')


def _make_scraper(name_, validationFunc, apiField):
class Scraper(RedditPushshiftScraper):
name = f'reddit-{name_}'
class RedditScraper(RedditPushshiftScraper):
def __init__(self, name, **kwargs):
super().__init__(**kwargs)
self._name = name
if not type(self)._validationFunc(self._name):
raise ValueError(f'invalid {type(self).name.split("-", 1)[1]} name')

def get_items(self):
yield from self._iter_api_submissions_and_comments({type(self)._apiField: self._name})

def __init__(self, name, **kwargs):
super().__init__(**kwargs)
self._name = name
if not validationFunc(self._name):
raise ValueError(f'invalid {name_} name')
@classmethod
def setup_parser(cls, subparser):
super()._setup_parser_opts(subparser)
name = cls.name.split('-', 1)[1]
subparser.add_argument(name, type = snscrape.base.nonempty_string(name))

@classmethod
def from_args(cls, args):
name = cls.name.split('-', 1)[1]
return cls._construct(args, getattr(args, name), submissions = not args.noSubmissions, comments = not args.noComments, before = args.before, after = args.after)

def get_items(self):
yield from self._iter_api_submissions_and_comments({apiField: self._name})

@classmethod
def setup_parser(cls, subparser):
super()._setup_parser_opts(subparser)
subparser.add_argument(name_, type = snscrape.base.nonempty_string(name_))
class RedditUserScraper(RedditScraper):
    """Scraper that queries submissions and comments by Reddit user name."""

    # Scraper identifier; the base class uses the part after 'reddit-' as the
    # name of the positional CLI argument and in the validation error message.
    name = 'reddit-user'
    # Looked up on the class (type(self)) by RedditScraper.__init__, so the
    # lambda is called unbound with the name as its only argument.
    # re.fullmatch replaces re.match with '^...$' because '$' also matches just
    # before a trailing newline, which let a name ending in '\n' pass.
    _validationFunc = lambda x: re.fullmatch('[A-Za-z0-9_-]{3,20}', x)
    # Key under which the name is placed in the dict handed to
    # _iter_api_submissions_and_comments.
    _apiField = 'author'

@classmethod
def from_args(cls, args):
return cls._construct(args, getattr(args, name_), submissions = not args.noSubmissions, comments = not args.noComments, before = args.before, after = args.after)

Scraper.__name__ = f'Reddit{name_.capitalize()}Scraper'
Scraper.__qualname__ = Scraper.__name__
globals()[Scraper.__name__] = Scraper
class RedditSubredditScraper(RedditScraper):
    """Scraper that queries submissions and comments by subreddit name."""

    # Scraper identifier; the base class uses the part after 'reddit-' as the
    # name of the positional CLI argument and in the validation error message.
    name = 'reddit-subreddit'
    # Looked up on the class (type(self)) by RedditScraper.__init__, so the
    # lambda is called unbound with the name as its only argument.
    # re.fullmatch replaces re.match with '^...$' because '$' also matches just
    # before a trailing newline, which let a name ending in '\n' pass.
    _validationFunc = lambda x: re.fullmatch('[A-Za-z0-9][A-Za-z0-9_]{2,20}', x)
    # Key under which the name is placed in the dict handed to
    # _iter_api_submissions_and_comments.
    _apiField = 'subreddit'


_make_scraper('user', lambda x: re.match('^[A-Za-z0-9_-]{3,20}$', x), 'author')
_make_scraper('subreddit', lambda x: re.match('^[A-Za-z0-9][A-Za-z0-9_]{2,20}$', x), 'subreddit')
_make_scraper('search', lambda x: True, 'q')
class RedditSearchScraper(RedditScraper):
    """Scraper that queries submissions and comments by free-text search term."""

    # Key under which the search term is placed in the dict handed to
    # _iter_api_submissions_and_comments.
    _apiField = 'q'
    # Every search term is accepted; the base class calls this through the
    # class, so the lambda receives only the term itself.
    _validationFunc = lambda _query: True
    # Scraper identifier; the base class uses the part after 'reddit-' as the
    # name of the positional CLI argument.
    name = 'reddit-search'

0 comments on commit eee06d8

Please sign in to comment.