#!/usr/bin/env python3

import sys
import os
import re
import json
import difflib
import subprocess
from hashlib import md5
from urllib.parse import urlsplit, quote, quote_plus, unquote

# Directory (relative to the working directory) holding downloaded bodies.
cache_dir = "cache"

# User-Agent header sent by wget when fetching pages.
UA = "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.76 Safari/537.36"

# Matches one percent-escape such as b"%2F"; used by lower_escapes().
_PERCENT_ESCAPE_RE = re.compile(b'(%[a-fA-F0-9]{2})')


def md5_url(url):
    """Return the hex MD5 digest of *url* (a str) encoded as UTF-8."""
    return md5(url.encode("utf-8")).hexdigest()


def get_cache_filename(url):
    """Return the path inside ``cache_dir`` where the body of *url* is stored."""
    return os.path.join(cache_dir, md5_url(url))


def get_body(url):
    """
    Return the raw bytes of the page at *url*, using the on-disk cache.

    On a cache miss the page is downloaded with ``wget`` into the cache file
    and a sidecar ``<cache file>.info.json`` recording the URL is written.
    wget's exit status is deliberately ignored (``--content-on-error`` asks it
    to keep error-page bodies too).

    Raises whatever ``subprocess.call`` / ``open`` raise, e.g. when ``wget``
    is not installed or the cache file could not be created.
    """
    fname = get_cache_filename(url)
    if not os.path.exists(fname):
        # Without the cache directory wget cannot write its -O target and the
        # sidecar write below would fail with FileNotFoundError.
        os.makedirs(cache_dir, exist_ok=True)
        # "--" ends option parsing so a URL starting with "-" cannot be
        # interpreted as a wget option.
        subprocess.call(
            ["wget", "--content-on-error", "-U", UA, "-O", fname, "--", url]
        )
        with open(fname + ".info.json", "w") as f:
            f.write(json.dumps({"url": url}))
    with open(fname, "rb") as f:
        return f.read()


def lower_escapes(url):
    """
    Return *url* (bytes) with the hex digits of every ``%XX`` escape lowercased.

    Bytes outside of percent-escapes are left untouched.
    """
    assert isinstance(url, bytes), type(url)
    if b'%' not in url:
        # Fast path: nothing to rewrite.
        return url
    return _PERCENT_ESCAPE_RE.sub(lambda m: m.group(1).lower(), url)


def kill_path(path, body):
    """
    Return *body* (bytes) with every recognised spelling of *path* removed.

    *path* is a str.  The spellings removed, in order, are: the UTF-8 path
    itself, the path with ``/`` written as ``\\/``, the ``quote_plus`` form
    (with upper- and lowercase hex escapes), the path with slashes dropped
    (only if at least 5 characters remain), the path with slashes replaced by
    underscores, the double-quoted form with slashes written as ``\\u002F``,
    and -- when the path contains ``%`` -- the ``quote_plus`` form of the
    unquoted path (only if the unquoted path has at least 4 characters).
    """
    # NOTE(review): the nesting below was reconstructed from a source whose
    # indentation was lost; confirm against the upstream file.
    encoded_path = path.encode("utf-8")
    plus_quoted = quote_plus(path).encode("utf-8")

    body = body.replace(encoded_path, b"")
    # JSON-style escaped slashes
    body = body.replace(encoded_path.replace(b"/", br"\/"), b"")
    body = body.replace(plus_quoted, b"")
    body = body.replace(lower_escapes(plus_quoted), b"")

    path_without_slashes = path.replace("/", "")
    if len(path_without_slashes) >= 5:
        body = body.replace(path_without_slashes.encode("utf-8"), b"")

    # For Dokuwiki
    path_underscored = path.replace("/", "_")
    body = body.replace(path_underscored.encode("utf-8"), b"")

    # For Drupal "jQuery.extend(Drupal.settings" line
    path_jsoned = '"' + path.replace("/", "\\u002F") + '"'
    body = body.replace(path_jsoned.encode("utf-8"), b"")

    if '%' in path:
        unquoted_path = unquote(path)
        if len(unquoted_path) >= 4:
            requoted = quote_plus(unquoted_path).encode("utf-8")
            body = body.replace(requoted, b"")
            body = body.replace(lower_escapes(requoted), b"")

    return body


# NOTE(review): the original physical line continues with the opening of
# process_body, whose definition runs on past this block.  That fragment is
# preserved verbatim below (it was already part of the shebang comment line).
# def process_body(body, url): """ Return a post-processed page body that excludes irrelevant content that would
prevent duplicate pages from being detected as duplicates. """ assert isinstance(body, bytes), type(body) drupal = b"Drupal" in body u = urlsplit(url) # Needed for www.tragnarion.com path = u.path.rstrip('/') if path.startswith('/'): path = path[1:] if len(path) >= 5: body = kill_path(path, body) # Drupal websites sometimes embed the current URL excluding the # first and/or second path component shorter_path = '/'.join(path.split('/')[2:]) if len(shorter_path) >= 50: body = kill_path(shorter_path, body) if len(u.query) >= 3: encoded_query = u.query.encode("utf-8") body = body.replace(('?' + u.query).encode("utf-8"), b"") body = body.replace(quote('?' + u.query).encode("utf-8"), b"") # Strip HTML comments, which sometimes include timestamps or # page generation stats body = re.sub(br'<\!--.{1,4000}?-->', b"", body, count=1000, flags=re.DOTALL) # Drupal generates a "theme_token":"..." inside a JSON blob # CloudFlare has a petok:"-1413059798-86400" body = re.sub(br'(petok|_token|applicationTime)"?:("[-_A-Za-z0-9\.]+"|[0-9\.]+)', b"", body) # Handle any 10-256 characters of hex or decimal # Minimum of 10 to handle UNIX timestamps body = re.sub(br'[A-Fa-f0-9\.]{10,256}', b"", body) # Spotted on http://mtnldelhi.in/: # id="tabber_container_0_991"> # id="tab_1-1_340"> # body = re.sub(br'\b(id|name|class)="[^"]{0,100}[-_]\d+"', b"", body) # Randomized anti-spam mailto: lines body = re.sub(br'([0-9a-fA-Fx]{2,4};){3,100}', b"", body) # Kill twitter and facebook share buttons, no matter what kind of # URL they stuffed in there. body = re.sub(br'
', b"", body) # Spotted on eff.org drupal body = re.sub(br'', b"", body) # Spotted on http://www.museodelvideojuego.com/ - handles # # Spotted on http://2045.com/ # ]{1,16384}?>', b"", body) # Spotted on http://www.communauteanimalcrossing.fr/ body = re.sub(br'', b"", body) # vbulletin body = re.sub(br'\(\d+ Viewing\)', b"", body) body = re.sub(br'Currently Active Users: \d+ \(\d+ members and \d+ guests\)', b"", body) # v= on http://vstreamers.com/v/images/css/p/videos # cb= on megahits.sapo.pt # pos= on www.smartcast.com.mx body = re.sub(br'[&\?]((v|cb)=\d+|pos=[A-Za-z0-9=]+)', b"", body) # spotted on espn.go.com and others body = re.sub(br'(splinks-|var hash = .|":"?)-?\d+', b"", body) # Kill newrelic inline script body = re.sub(br'window\.NREUM\|\|\(NREUM=\{\}\);NREUM\.info=\{.{1,3000}?\}', b"", body) if drupal: # Kill entire Drupal settings line body = re.sub(br'jQuery\.extend\(Drupal.settings, ?\{.{1,40000}?\}\);', b"", body) # Drupal generates this class id body = re.sub(br"\bview-dom-id-[0-9a-f]+\b", b"", body) # Drupal sites have randomized sidebar content with these IDs body = re.sub(br'