# grab-site/libgrabsite/ignoracle.py

import re
import sys

from urllib.parse import urlparse

# Increase the compiled regexp cache limit from 512 to 4096 in case
# someone uses a lot of ignores.  Ignoracle.ignores hands pattern strings
# straight to re.search on every call, so it depends on the re module's
# internal cache to avoid recompiling each pattern for each URL.
# NOTE(review): _MAXCACHE is a private attribute of the re module --
# confirm it is still honoured on the Python versions in use.
re._MAXCACHE = 4096


class Ignoracle(object):
	"""
	An Ignoracle tests a URL against a list of patterns and returns whether or
	not that URL should be grabbed.

	An Ignoracle's pattern list starts as the empty list.
	"""

	# Shared empty default.  set_patterns rebinds self.patterns on the
	# instance instead of mutating this list, so instances never leak
	# patterns to each other through it.
	patterns = []

	def set_patterns(self, strings):
		"""
		Given a list of strings, replaces this Ignoracle's pattern state with
		that list.

		Items that are bytes are decoded as UTF-8; every other item is stored
		unchanged.  The new list is built completely before it is assigned, so
		a decoding failure leaves the previous patterns in place.
		"""
		self.patterns = [
			string.decode('utf-8') if isinstance(string, bytes) else string
			for string in strings
		]

	def ignores(self, url, **kwargs):
		"""
		If an ignore pattern matches the given URL, returns that pattern as a string.
		Otherwise, returns False.

		Recognized keyword arguments are primary_url and primary_netloc.  Their
		regexp-escaped values are substituted for the {primary_url} and
		{primary_netloc} placeholders in each pattern before matching.  A
		missing or None value is substituted as the empty string.

		The returned value is the pattern as stored (placeholders intact), not
		the expanded regexp.  A pattern that is not a valid regexp is reported
		on stderr and skipped.
		"""
		# The values may be present but None (parameterize_record_info yields
		# None when it cannot determine them); re.escape(None) raises
		# TypeError, so coerce anything falsy to ''.
		pu = re.escape(kwargs.get('primary_url') or '')
		ph = re.escape(kwargs.get('primary_netloc') or '')

		for pattern in self.patterns:
			regexp = pattern
			# Cheap check so patterns without placeholders skip the replaces.
			if '{' in regexp:
				regexp = regexp.replace('{primary_url}', pu).replace('{primary_netloc}', ph)
			try:
				if re.search(regexp, url):
					return pattern
			except re.error as error:
				print('Pattern %s is invalid (error: %s).  Ignored.' % (pattern, str(error)), file=sys.stderr)

		return False


def parameterize_record_info(record_info):
	"""
	Given a wpull record_info dict, generates a dict with primary_url and
	primary_netloc keys.  This is meant to be used in Ignoracle.ignores.

	The primary_url key is:

	1. record_info['url'] if record_info['level'] is zero, or
	2. record_info['top_url'] otherwise.

	Either lookup yields None when the key is missing.

	If primary_url is non-empty and can be parsed, the primary_netloc key is
	the network location component of primary_url (i.e. for HTTP,
	[user:password@]host[:port]); this is the empty string for a string that
	has no network location.  If primary_url is None or empty, or cannot be
	parsed, primary_netloc is None.
	"""
	if record_info.get('level') == 0:
		primary_url = record_info.get('url')
	else:
		primary_url = record_info.get('top_url')

	primary_netloc = None
	if primary_url:
		try:
			primary_netloc = urlparse(primary_url).netloc
		except ValueError:
			# urlparse rejects some malformed URLs (e.g. an unbalanced
			# IPv6 bracket in the netloc).  Leave primary_netloc as None
			# rather than letting the error propagate.
			pass

	return dict(
		primary_url=primary_url,
		primary_netloc=primary_netloc
	)
Use global ignore set and also ignore Icecast sites like ArchiveBot 2015-02-05 03:59:21 +00:00			`import re`
			`import sys`

			`from urllib.parse import urlparse`

Increase size of compiled regexp cache; remove unused code 2015-08-12 07:52:24 +00:00			`# Increase the compiled regexp cache limit from 512 to 4096 in case`
			`# someone uses a lot of ignores.`
			`re._MAXCACHE = 4096`
Spaces -> tabs 2015-07-18 11:00:08 +00:00
Use global ignore set and also ignore Icecast sites like ArchiveBot 2015-02-05 03:59:21 +00:00
			`class Ignoracle(object):`
Spaces -> tabs 2015-07-18 11:00:08 +00:00			`"""`
			`An Ignoracle tests a URL against a list of patterns and returns whether or`
			`not that URL should be grabbed.`
Use global ignore set and also ignore Icecast sites like ArchiveBot 2015-02-05 03:59:21 +00:00
Spaces -> tabs 2015-07-18 11:00:08 +00:00			`An Ignoracle's pattern list starts as the empty list.`
			`"""`
Use global ignore set and also ignore Icecast sites like ArchiveBot 2015-02-05 03:59:21 +00:00
Spaces -> tabs 2015-07-18 11:00:08 +00:00			`patterns = []`
Use global ignore set and also ignore Icecast sites like ArchiveBot 2015-02-05 03:59:21 +00:00
Spaces -> tabs 2015-07-18 11:00:08 +00:00			`def set_patterns(self, strings):`
			`"""`
			`Given a list of strings, replaces this Ignoracle's pattern state with`
			`that list.`
			`"""`
			`self.patterns = []`
Use global ignore set and also ignore Icecast sites like ArchiveBot 2015-02-05 03:59:21 +00:00
Spaces -> tabs 2015-07-18 11:00:08 +00:00			`for string in strings:`
			`if isinstance(string, bytes):`
			`string = string.decode('utf-8')`
Use global ignore set and also ignore Icecast sites like ArchiveBot 2015-02-05 03:59:21 +00:00
Spaces -> tabs 2015-07-18 11:00:08 +00:00			`self.patterns.append(string)`
Use global ignore set and also ignore Icecast sites like ArchiveBot 2015-02-05 03:59:21 +00:00
Spaces -> tabs 2015-07-18 11:00:08 +00:00			`def ignores(self, url, **kwargs):`
			`"""`
			`If an ignore pattern matches the given URL, returns that pattern as a string.`
			`Otherwise, returns False.`
			`"""`
Fix very recent regression: report the pattern instead of the regexp 2015-08-12 08:51:41 +00:00			`pu = re.escape(kwargs.get('primary_url', ''))`
			`ph = re.escape(kwargs.get('primary_netloc', ''))`
Use global ignore set and also ignore Icecast sites like ArchiveBot 2015-02-05 03:59:21 +00:00
Spaces -> tabs 2015-07-18 11:00:08 +00:00			`for pattern in self.patterns:`
Fix very recent regression: report the pattern instead of the regexp 2015-08-12 08:51:41 +00:00			`regexp = pattern`
			`if '{' in regexp:`
			`regexp = regexp.replace('{primary_url}', pu).replace('{primary_netloc}', ph)`
Spaces -> tabs 2015-07-18 11:00:08 +00:00			`try:`
Fix very recent regression: report the pattern instead of the regexp 2015-08-12 08:51:41 +00:00			`if re.search(regexp, url):`
Spaces -> tabs 2015-07-18 11:00:08 +00:00			`return pattern`
			`except re.error as error:`
			`print('Pattern %s is invalid (error: %s). Ignored.' % (pattern, str(error)), file=sys.stderr)`
Use global ignore set and also ignore Icecast sites like ArchiveBot 2015-02-05 03:59:21 +00:00
Spaces -> tabs 2015-07-18 11:00:08 +00:00			`return False`
Use global ignore set and also ignore Icecast sites like ArchiveBot 2015-02-05 03:59:21 +00:00
Increase size of compiled regexp cache; remove unused code 2015-08-12 07:52:24 +00:00
Use global ignore set and also ignore Icecast sites like ArchiveBot 2015-02-05 03:59:21 +00:00			`def parameterize_record_info(record_info):`
Spaces -> tabs 2015-07-18 11:00:08 +00:00			`"""`
			`Given a wpull record_info dict, generates a dict with primary_url and`
			`primary_netloc keys. This is meant to be used in Ignoracle.ignores.`
Use global ignore set and also ignore Icecast sites like ArchiveBot 2015-02-05 03:59:21 +00:00
Spaces -> tabs 2015-07-18 11:00:08 +00:00			`The primary_url key is:`
Use global ignore set and also ignore Icecast sites like ArchiveBot 2015-02-05 03:59:21 +00:00
Spaces -> tabs 2015-07-18 11:00:08 +00:00			`1. record_info['top_url'], or`
			`2. record_info['url'] if record_info['level'] is zero, or`
			`3. None otherwise.`
Use global ignore set and also ignore Icecast sites like ArchiveBot 2015-02-05 03:59:21 +00:00
Spaces -> tabs 2015-07-18 11:00:08 +00:00			`If primary_url is a valid URL, the primary_netloc key is the network`
			`location component of primary_url (i.e. for HTTP,`
			`[user:password@]host[:port]). Otherwise, primary_netloc is None.`
			`"""`
			`primary_url = None`
			`primary_netloc = None`
Use global ignore set and also ignore Icecast sites like ArchiveBot 2015-02-05 03:59:21 +00:00
Spaces -> tabs 2015-07-18 11:00:08 +00:00			`if record_info.get('level') == 0:`
			`primary_url = record_info.get('url')`
			`else:`
			`primary_url = record_info.get('top_url')`
Use global ignore set and also ignore Icecast sites like ArchiveBot 2015-02-05 03:59:21 +00:00
Spaces -> tabs 2015-07-18 11:00:08 +00:00			`if primary_url:`
			`parsed = urlparse(primary_url)`
			`primary_netloc = parsed.netloc`
Use global ignore set and also ignore Icecast sites like ArchiveBot 2015-02-05 03:59:21 +00:00
Spaces -> tabs 2015-07-18 11:00:08 +00:00			`return dict(`
			`primary_url=primary_url,`
			`primary_netloc=primary_netloc`
			`)`