#!/usr/bin/env python
# coding: utf-8

# This Source Code is subject to the terms of the Mozilla Public License
# version 2.0 (the "License"). You can obtain a copy of the License at
# http://mozilla.org/MPL/2.0/.

"""Combine Adblock Plus subscription sources into publishable files.

For every subscription file in the source directories this script resolves
``%include ...%`` directives, substitutes ``%timestamp%``, adds a checksum
comment, writes the result plus an MSIE Tracking Protection List (``.tpl``)
conversion into the target directory and tries to gzip each written file
with 7za.  The code runs on Python 2.6+ as well as Python 3.
"""

import sys
import os
import re
import subprocess
import time
import traceback
import codecs
import hashlib
import base64
from getopt import getopt, GetoptError

try:
    import urllib2
except ImportError:
    # Python 3: urllib.request exposes both urlopen() and URLError.
    import urllib.request as urllib2

try:
    _STRING_TYPES = basestring
except NameError:
    # Python 3 has no basestring.
    _STRING_TYPES = str

acceptedExtensions = {
    '.txt': True,
}
ignore = {
    'Apache.txt': True,
    'CC-BY-SA.txt': True,
    'GPL.txt': True,
    'MPL.txt': True,
}
verbatim = {
    'COPYING': True,
}

# First line of a subscription file, e.g. "[Adblock Plus 2.0]".
_HEADER_REGEXP = re.compile(r'\[Adblock(?:\s*Plus\s*([\d\.]+)?)?\]', re.I)
# A checksum comment anywhere in a line.
_CHECKSUM_COMMENT_REGEXP = re.compile(r'!\s*checksum[\s\-:]+([\w\+\/=]+)', re.I)
# A complete checksum line including its line break (used when diffing).
_CHECKSUM_LINE_REGEXP = re.compile(r'^.*!\s*checksum[\s\-:]+([\w\+\/=]+).*\n', re.M | re.I)
# Timestamps as produced for %timestamp%, e.g. " 01 Jan 2013 12:00 UTC".
_TIMESTAMP_REGEXP = re.compile(r'\s*\d+ \w+ \d+ \d+:\d+ UTC')
# Metadata comments that are dropped from remotely fetched includes.
_REMOTE_EXPIRES_REGEXP = re.compile(r'^\s*!.*?\bExpires\s*(?::|after)\s*(\d+)\s*(h)?', re.M | re.I)
_REMOTE_METADATA_REGEXP = re.compile(r'^\s*!\s*(Redirect|Homepage|Title)\s*:', re.M | re.I)

# Filter options that MSIE does not support but that can be dropped safely.
_IGNORED_OPTIONS = ('', 'third-party', '~third-party', 'match-case',
                    '~match-case', '~other', '~donottrack')
# Type options that have no equivalent in MSIE.
_UNSUPPORTED_TYPE_OPTIONS = ('other', 'elemhide')


def _stripLineBreaks(line):
    """Return line with all CR and LF characters removed."""
    return re.sub(r'[\r\n]', '', line)


def _readLines(path):
    """Return the lines of the UTF-8 file at path without line breaks."""
    with codecs.open(path, 'rb', encoding='utf-8') as handle:
        return [_stripLineBreaks(line) for line in handle.readlines()]


def _stripVolatileData(data):
    """Return data without checksum lines and generated timestamps."""
    data = _CHECKSUM_LINE_REGEXP.sub('', data)
    return _TIMESTAMP_REGEXP.sub('', data)


def _charsetFromContentType(contentType, default='utf-8'):
    """Return the charset named in a Content-Type header value, or default."""
    # Stop at ';' or whitespace and tolerate quotes so that values such as
    # 'text/plain; charset="utf-8"; format=flowed' yield just "utf-8".
    match = re.search(r'charset\s*=\s*"?([^\s;"]+)', contentType, re.I)
    if match:
        return match.group(1)
    return default


def combineSubscriptions(sourceDirs, targetDir, timeout=30):
    """Process all subscription files of the source directories.

    sourceDirs is either a single directory path or a dict mapping
    repository names to directory paths.  Results are written to targetDir
    (created if missing).  timeout is the number of seconds allowed for
    fetching remote includes.  A file that fails to process is reported on
    stderr and skipped.  Finally every non-hidden file in targetDir that
    does not correspond to a source file is deleted.
    """
    if isinstance(sourceDirs, _STRING_TYPES):
        sourceDirs = {'': sourceDirs}

    if not os.path.exists(targetDir):
        os.makedirs(targetDir, 0o755)

    known = {}
    for sourceName, sourceDir in sourceDirs.items():
        for file in os.listdir(sourceDir):
            if (file in ignore or file.startswith('.') or
                    not os.path.isfile(os.path.join(sourceDir, file))):
                continue
            if file in verbatim:
                processVerbatimFile(sourceDir, targetDir, file)
            elif os.path.splitext(file)[1] not in acceptedExtensions:
                continue
            else:
                try:
                    processSubscriptionFile(sourceName, sourceDirs, targetDir, file, timeout)
                except Exception:
                    # Best effort: report the failure and continue with the
                    # remaining files. KeyboardInterrupt still propagates.
                    sys.stderr.write('Error processing subscription file "%s"\n' % file)
                    traceback.print_exc()
                    sys.stderr.write('\n')
                baseName = os.path.splitext(file)[0]
                known[baseName + '.tpl'] = True
                known[baseName + '.tpl.gz'] = True
            known[file] = True
            known[file + '.gz'] = True

    # Remove output that no longer has a corresponding source file.
    for file in os.listdir(targetDir):
        if file.startswith('.'):
            continue
        if file not in known:
            os.remove(os.path.join(targetDir, file))


def conditionalWrite(filePath, data):
    """Write data to filePath unless only volatile parts would change.

    If the file exists and differs from data only in checksum lines and
    generated timestamps it is left untouched.  Otherwise the file is
    written as UTF-8 and compressed to filePath + '.gz' using 7za; a
    failure to run 7za is reported on stderr but is not fatal.
    """
    if os.path.exists(filePath):
        with codecs.open(filePath, 'rb', encoding='utf-8') as handle:
            oldData = handle.read()
        if _stripVolatileData(oldData) == _stripVolatileData(data):
            return

    with codecs.open(filePath, 'wb', encoding='utf-8') as handle:
        handle.write(data)

    try:
        subprocess.Popen(['7za', 'a', '-tgzip', '-mx=9', '-bd', '-mpass=15',
                          filePath + '.gz', filePath],
                         stdout=subprocess.PIPE).communicate()
    except Exception:
        sys.stderr.write('Failed to compress file %s. Please ensure that p7zip is installed on the system.\n' % filePath)


def processVerbatimFile(sourceDir, targetDir, file):
    """Copy file from sourceDir to targetDir without modifying its text."""
    with codecs.open(os.path.join(sourceDir, file), 'rb', encoding='utf-8') as handle:
        data = handle.read()
    conditionalWrite(os.path.join(targetDir, file), data)


def processSubscriptionFile(sourceName, sourceDirs, targetDir, file, timeout):
    """Build the final subscription and its TPL conversion for one file.

    The file is read from sourceDirs[sourceName].  Raises Exception if the
    first line is not an Adblock Plus header; errors from resolving
    includes propagate to the caller.
    """
    sourceDir = sourceDirs[sourceName]
    filePath = os.path.join(sourceDir, file)
    lines = _readLines(filePath)

    header = lines.pop(0) if lines else ''
    if not _HEADER_REGEXP.search(header):
        raise Exception('This is not a valid Adblock Plus subscription file.')

    lines = resolveIncludes(sourceName, sourceDirs, filePath, lines, timeout)
    # Drop empty lines and any checksum carried over from the sources.
    lines = [line for line in lines
             if line != '' and not _CHECKSUM_COMMENT_REGEXP.search(line)]

    writeTPL(os.path.join(targetDir, os.path.splitext(file)[0] + '.tpl'), lines)

    checksum = hashlib.md5()
    checksum.update((header + '\n' + '\n'.join(lines)).encode('utf-8'))
    # b64encode() returns bytes on Python 3, decode before formatting so the
    # comment does not end up as "b'...'". Padding characters are removed.
    encoded = base64.b64encode(checksum.digest()).decode('ascii').replace('=', '')
    lines.insert(0, '! Checksum: %s' % encoded)
    lines.insert(0, header)
    conditionalWrite(os.path.join(targetDir, file), '\n'.join(lines))


def _fetchRemoteLines(url, timeout, attempts=3, retryDelay=5):
    """Download url and return its lines without subscription metadata.

    The download is tried up to attempts times, waiting retryDelay seconds
    between tries; the last urllib2.URLError is raised if all of them fail.
    Expires/Redirect/Homepage/Title comments are removed from the result.
    """
    response = None
    error = None
    for attempt in range(attempts):
        if attempt > 0:
            # Only wait between attempts, not after the final failure.
            time.sleep(retryDelay)
        try:
            response = urllib2.urlopen(url, None, timeout)
            break
        except urllib2.URLError as e:
            error = e
    if response is None:
        raise error

    try:
        charset = _charsetFromContentType(response.headers.get('content-type', ''))
        data = response.read()
    finally:
        response.close()

    lines = [_stripLineBreaks(line) for line in data.decode(charset).split('\n')]
    return [line for line in lines
            if not _REMOTE_EXPIRES_REGEXP.search(line) and
            not _REMOTE_METADATA_REGEXP.search(line)]


def resolveIncludes(sourceName, sourceDirs, filePath, lines, timeout, level=0):
    """Return lines with %include ...% and %timestamp% directives resolved.

    An include target is either an HTTP/HTTPS URL or a file path relative
    to a repository directory, optionally written as "repository:path" to
    address another entry of sourceDirs.  Each include is replaced by a
    marker comment followed by the included lines (minus their Adblock
    header); local includes are resolved recursively.  %timestamp% is
    replaced by the current UTC time at level 0, in included files the
    whole line is blanked instead.

    Raises Exception for more than five levels of nesting, an unknown
    repository or a path escaping the repository directory.
    """
    if level > 5:
        raise Exception('There are too many nested includes, which is probably the result of a circular reference somewhere.')

    result = []
    for line in lines:
        match = re.search(r'^\s*%include\s+(.*)%\s*$', line)
        if not match:
            if line.find('%timestamp%') >= 0:
                if level == 0:
                    line = line.replace('%timestamp%', time.strftime('%d %b %Y %H:%M UTC', time.gmtime()))
                else:
                    line = ''
            result.append(line)
            continue

        target = match.group(1)
        if re.match(r'^https?://', target):
            result.append('! *** Fetched from: %s ***' % target)
            newLines = _fetchRemoteLines(target, timeout)
        else:
            result.append('! *** %s ***' % target)

            includeSource = sourceName
            if target.find(':') >= 0:
                includeSource, target = target.split(':', 1)
            if includeSource not in sourceDirs:
                raise Exception('Cannot include file from repository "%s", this repository is unknown' % includeSource)

            parentDir = sourceDirs[includeSource]
            includePath = os.path.join(parentDir, target)
            # A relative path starting with a dot would point outside of
            # the repository directory (or at a hidden file).
            relPath = os.path.relpath(includePath, parentDir)
            if len(relPath) == 0 or relPath[0] == '.':
                raise Exception('Invalid include "%s", needs to be an HTTP/HTTPS URL or a relative file path' % target)

            # The file is read and closed before recursing so that no
            # handle stays open if a nested include fails.
            newLines = _readLines(includePath)
            newLines = resolveIncludes(includeSource, sourceDirs, includePath, newLines, timeout, level + 1)

        if len(newLines) and _HEADER_REGEXP.search(newLines[0]):
            del newLines[0]
        result.extend(newLines)
    return result


def _convertCommentToTPL(line):
    """Return the TPL equivalent of a comment line."""
    # Handle "Expires" comment in a special way, keep the rest.
    match = re.search(r'\bExpires\s*(?::|after)\s*(\d+)\s*(h)?', line, re.I)
    if match:
        interval = int(match.group(1))
        if match.group(2):
            # Value is given in hours, TPL expects whole days.
            interval = interval // 24
        return ': Expires=%i' % interval
    # Every exclamation mark is turned into the TPL comment character.
    return line.replace('!', '#')


def _convertFilterToTPL(filterText):
    """Return the TPL line for a blocking or exception rule.

    Rules that cannot be expressed in a TPL are returned commented out
    ("# " followed by the original rule text).
    """
    line = filterText
    isException = line.startswith('@@')
    if isException:
        line = line[2:]

    hasUnsupportedOptions = False
    requiresScript = False
    match = re.search(r'^(.*?)\$(.*)', line)
    if match:
        # This rule has options, check whether any of them are important
        line = match.group(1)
        options = match.group(2).replace('_', '-').lower().split(',')

        # Remove first-party only exceptions, we will allow an ad server
        # everywhere otherwise
        if isException and '~third-party' in options:
            hasUnsupportedOptions = True

        # A number of options are not supported in MSIE but can be safely
        # ignored, remove them
        options = [o for o in options if o not in _IGNORED_OPTIONS]

        # Also ignore domain negation of whitelists
        if isException:
            options = [o for o in options if not o.startswith('domain=~')]

        unsupportedOptions = [o for o in options if o in _UNSUPPORTED_TYPE_OPTIONS]
        if unsupportedOptions and len(unsupportedOptions) == len(options):
            # The rule only applies to types that are not supported in MSIE
            hasUnsupportedOptions = True
        elif 'donottrack' in options:
            # Do-Not-Track rules have to be removed even if $donottrack is
            # combined with other options
            hasUnsupportedOptions = True
        elif 'script' in options and len(options) == len(unsupportedOptions) + 1:
            # Mark rules that only apply to scripts for approximate conversion
            requiresScript = True
        elif options:
            # The rule has further options that aren't available in TPLs. For
            # exception rules that aren't specific to a domain we ignore all
            # remaining options to avoid potential false positives. Other
            # rules simply aren't included in the TPL file.
            if isException:
                # Only ever set the flag here: a plain assignment would undo
                # the ~third-party rejection made above.
                if any(o.startswith('domain=') for o in options):
                    hasUnsupportedOptions = True
            else:
                hasUnsupportedOptions = True

    if hasUnsupportedOptions:
        # Do not include filters with unsupported options
        return '# ' + filterText

    # Assume that separator placeholders mean slashes
    line = line.replace('^', '/')

    # Try to extract domain info
    domain = None
    match = re.search(r'^(\|\||\|\w+://)([^*:/]+)(:\d+)?(/.*)', line)
    if match:
        domain = match.group(2)
        line = match.group(4)
    else:
        # No domain info, remove anchors at the rule start
        line = re.sub(r'^\|\|', 'http://', line)
        line = re.sub(r'^\|', '', line)
    # Remove anchors at the rule end
    line = re.sub(r'\|$', '', line)
    # Remove unnecessary asterisks at the ends of lines
    line = re.sub(r'\*$', '', line)
    # Emulate $script by appending *.js to the rule
    if requiresScript:
        line += '*.js'
    if line.startswith('/*'):
        line = line[2:]

    if domain:
        line = '%sd %s %s' % ('+' if isException else '-', domain, line)
        return re.sub(r'\s+/$', '', line)
    if isException:
        # Exception rules without domains are unsupported
        return '# ' + filterText
    return '- ' + line


def writeTPL(filePath, lines):
    """Convert subscription lines to an MSIE TPL file and write it.

    lines are the filter lines of a subscription without the header and
    without empty lines.  Comments are kept, element hiding rules are
    dropped, blocking and exception rules are converted where possible and
    commented out otherwise.  The result is written via conditionalWrite().
    """
    result = ['msFilterList']
    for line in lines:
        if line.startswith('!'):
            result.append(_convertCommentToTPL(line))
        elif line.find('#') >= 0:
            # Element hiding rules are not supported in MSIE, drop them
            continue
        else:
            result.append(_convertFilterToTPL(line))
    conditionalWrite(filePath, '\n'.join(result) + '\n')


def usage():
    """Print command line help to stdout."""
    print('''Usage: %s [source_dir] [output_dir]

Options:
  -h          --help              Print this message and exit
  -t seconds  --timeout=seconds   Timeout when fetching remote subscriptions
''' % os.path.basename(sys.argv[0]))


if __name__ == '__main__':
    try:
        opts, args = getopt(sys.argv[1:], 'ht:', ['help', 'timeout='])
    except GetoptError as e:
        print(str(e))
        usage()
        sys.exit(2)

    sourceDir, targetDir = '.', 'subscriptions'
    if len(args) >= 1:
        sourceDir = args[0]
    if len(args) >= 2:
        targetDir = args[1]

    timeout = 30
    for option, value in opts:
        if option in ('-h', '--help'):
            usage()
            sys.exit()
        elif option in ('-t', '--timeout'):
            timeout = int(value)

    if os.path.exists(os.path.join(sourceDir, '.hg')):
        # Our source is a Mercurial repository, try updating
        subprocess.Popen(['hg', '-R', sourceDir, 'pull', '--update']).communicate()

    combineSubscriptions(sourceDir, targetDir, timeout)