222 lines
7.4 KiB
Python
222 lines
7.4 KiB
Python
#!/usr/bin/env python3
|
|
|
|
import sys
|
|
import os
|
|
import re
|
|
import json
|
|
import difflib
|
|
import subprocess
|
|
|
|
from hashlib import md5
|
|
from urllib.parse import urlsplit, quote, quote_plus, unquote
|
|
|
|
# Directory holding cached page bodies; it is a relative path, so it is
# resolved against the current working directory.
cache_dir = "cache"
|
|
|
|
|
|
def md5_url(url):
    """Return the hexadecimal MD5 digest of `url` encoded as UTF-8."""
    hasher = md5()
    hasher.update(url.encode("utf-8"))
    return hasher.hexdigest()
|
|
|
|
|
|
def get_cache_filename(url):
    """Return the path of the cache file for `url` inside `cache_dir`.

    The file name is the MD5 hex digest of the URL.
    """
    digest = md5_url(url)
    return os.path.join(cache_dir, digest)
|
|
|
|
|
|
# User-Agent string handed to wget through its -U option when fetching pages.
UA = "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.76 Safari/537.36"
|
|
|
|
def get_body(url):
    """Return the body of `url` as bytes, using the on-disk cache.

    On a cache hit the cached file is read and returned.  On a miss, wget
    is run to download the URL into the cache file, a sidecar file named
    `<cache file>.info.json` recording the URL is written, and the
    downloaded bytes are returned.  wget's exit status is not checked.
    """
    cache_path = get_cache_filename(url)
    if not os.path.exists(cache_path):
        # NOTE(review): --content-on-error presumably keeps the response
        # body even for HTTP error statuses - confirm in the wget manual.
        subprocess.call(["wget", "--content-on-error", "-U", UA, url, "-O", cache_path])
        info = json.dumps({"url": url})
        with open(cache_path + ".info.json", "w") as info_file:
            info_file.write(info)
    with open(cache_path, "rb") as cached:
        return cached.read()
|
|
|
|
|
|
def lower_escapes(url):
    """Return `url` with every %XX percent-escape lowercased.

    `url` must be bytes (checked with an assertion).  Only escapes made of
    a `%` followed by exactly two hex digits are touched; everything else
    is returned unchanged.
    """
    assert isinstance(url, bytes), type(url)

    def lowered(match):
        return match.group(1).lower()

    if b'%' in url:
        return re.sub(b'(%[a-fA-F0-9]{2})', lowered, url)
    # Nothing to rewrite: skip the regex entirely.
    return url
|
|
|
|
|
|
def kill_path(path, body):
    """Return `body` (bytes) with the known spellings of `path` removed.

    `path` is a str.  Several encodings of it are built and each one is
    deleted from `body` in turn, in a fixed order: the raw UTF-8 bytes,
    the form with `/` escaped as `\\/`, the `quote_plus` form (and the
    same with lowercased escapes), the path with slashes removed (only
    when at least 5 characters remain), the path with slashes replaced by
    underscores, a double-quoted form with `/` written as `\\u002F`, and,
    when the path contains `%`, the re-quoted form of the unquoted path.
    """
    raw = path.encode("utf-8")
    plus_quoted = quote_plus(path).encode("utf-8")
    needles = [
        raw,
        raw.replace(b"/", br"\/"),
        plus_quoted,
        lower_escapes(plus_quoted),
    ]

    squashed = path.replace("/", "")
    if len(squashed) >= 5:
        needles.append(squashed.encode("utf-8"))

    # For Dokuwiki
    needles.append(path.replace("/", "_").encode("utf-8"))

    # For Drupal "jQuery.extend(Drupal.settings" line
    needles.append(('"' + path.replace("/", "\\u002F") + '"').encode("utf-8"))

    if '%' in path:
        decoded = unquote(path)
        if len(decoded) >= 4:
            requoted = quote_plus(decoded).encode("utf-8")
            needles.append(requoted)
            needles.append(lower_escapes(requoted))

    # Apply sequentially: an earlier removal can change what later
    # needles match, so the order above is significant.
    for needle in needles:
        body = body.replace(needle, b"")
    return body
|
|
|
|
|
|
def process_body(body, url):
    """
    Return a post-processed page body that excludes irrelevant content
    that would prevent duplicate pages from being detected as duplicates.

    `body` is the raw page as bytes (checked with an assertion) and `url`
    is the str URL it was fetched from.  The URL's path and query string
    are removed from the body, then a fixed sequence of regex
    substitutions deletes volatile markup (comments, tokens, timestamps,
    hidden inputs, share buttons, ...).  The substitutions run in order
    and later ones see the output of earlier ones.
    """
    assert isinstance(body, bytes), type(body)

    # Decided on the raw body, before any substitution can remove the marker.
    drupal = b"Drupal" in body

    u = urlsplit(url)
    # Needed for www.tragnarion.com
    path = u.path.rstrip('/')
    if path.startswith('/'):
        path = path[1:]
    if len(path) >= 5:
        body = kill_path(path, body)

    # Drupal websites sometimes embed the current URL excluding the
    # first and/or second path component
    shorter_path = '/'.join(path.split('/')[2:])
    if len(shorter_path) >= 50:
        body = kill_path(shorter_path, body)

    if len(u.query) >= 3:
        query_with_mark = '?' + u.query
        body = body.replace(query_with_mark.encode("utf-8"), b"")
        body = body.replace(quote(query_with_mark).encode("utf-8"), b"")

    # Strip HTML comments, which sometimes include timestamps or
    # page generation stats
    body = re.sub(br'<\!--.{1,4000}?-->', b"", body, count=1000, flags=re.DOTALL)

    # Drupal generates a "theme_token":"..." inside a JSON blob
    # CloudFlare has a petok:"-1413059798-86400"
    body = re.sub(br'(petok|_token|applicationTime)"?:("[-_A-Za-z0-9\.]+"|[0-9\.]+)', b"", body)

    # Handle any 10-256 characters of hex or decimal
    # Minimum of 10 to handle UNIX timestamps
    body = re.sub(br'[A-Fa-f0-9\.]{10,256}', b"", body)

    # Spotted on http://mtnldelhi.in/:
    # id="tabber_container_0_991">
    # id="tab_1-1_340">
    # <a name="tab_1-1_340">
    body = re.sub(br'\b(id|name|class)="[^"]{0,100}[-_]\d+"', b"", body)

    # Randomized anti-spam mailto: lines
    body = re.sub(br'<a href="mailto:[^"@]{1,100}@[^"]{2,100}">(&#[0-9a-fA-Fx]{2,4};){3,100}</a>', b"", body)

    # Kill twitter and facebook share buttons, no matter what kind of
    # URL they stuffed in there.
    body = re.sub(br'<div class="fb-like" data-href=".*?</div>', b"", body)
    body = re.sub(br'<a href="https?://twitter.com/share[^"]{0,4096}" class="twitter-share-button.*?</a>', b"", body)

    # Drupal puts the current URL here, and the casing doesn't always match
    body = re.sub(br'<(link rel="(canonical|shortlink|alternate)".{1,1000}?href=|meta property="og:url" content=)"[^"]+" />', b"", body)

    # Spotted on eff.org drupal
    body = re.sub(br'<link href="[^"]+" rel="alternate" hreflang="[^"]+" />', b"", body)

    # Spotted on http://www.museodelvideojuego.com/ - handles
    # <input type="hidden" name="form_build_id" value="form-ddmhsyCMnpZsHKCQN-l6R1j9EwMT3lHKDI4xXcyFcBA" />
    # Spotted on http://2045.com/
    # <input type="hidden" name="file_uploadToken" value="\d+"
    body = re.sub(br'<input type="hidden"[^>]{1,16384}?>', b"", body)

    # Spotted on http://www.communauteanimalcrossing.fr/
    body = re.sub(br'<param name="flashvars" value="servannee=\d{4}&servmois=\d{1,2}&servjour=\d{1,2}&servheure=\d{1,2}&servminute=\d{1,2}&servseconde=\d{1,2}" />', b"", body)

    # vbulletin
    body = re.sub(br'\(\d+ Viewing\)', b"", body)
    body = re.sub(br'Currently Active Users</a>: \d+ \(\d+ members and \d+ guests\)', b"", body)

    # v= on http://vstreamers.com/v/images/css/p/videos
    # cb= on megahits.sapo.pt
    # pos= on www.smartcast.com.mx
    body = re.sub(br'[&\?]((v|cb)=\d+|pos=[A-Za-z0-9=]+)', b"", body)

    # spotted on espn.go.com and others
    body = re.sub(br'(splinks-|var hash = .|":"?)-?\d+', b"", body)

    # Kill newrelic inline script
    body = re.sub(br'window\.NREUM\|\|\(NREUM=\{\}\);NREUM\.info=\{.{1,3000}?\}', b"", body)

    # NOTE(review): the extent of this `if` was reconstructed from a paste
    # that had lost its indentation; the final substitution below is assumed
    # to be unconditional because its comment cites a non-Drupal site - confirm.
    if drupal:
        # Kill entire Drupal settings line
        body = re.sub(br'jQuery\.extend\(Drupal.settings, ?\{.{1,40000}?\}\);', b"", body)

        # Drupal generates this class id
        body = re.sub(br"\bview-dom-id-[0-9a-f]+\b", b"", body)

        # Drupal sites have randomized sidebar content with these IDs
        body = re.sub(br'<div class="views-field views-field-[-a-z]+">.*', b"", body)

        # nsslabs.com has this
        body = re.sub(br'<div class="breadcrumb">.{1,4000}? </div>', b"", body)

        # sbs.com.au has generated /css_ filenames
        body = re.sub(br'/css_[-_A-Za-z0-9]{10,100}\.css', b"", body)

        # stopbadware.org has some differing autogenerated <style>
        body = re.sub(br'<style type="text/css" media="all">@import url\(.{1,4096}?\);</style>', b"", body)

    # Drupal generates <body class="..."> items based on the URL
    # Generated class="" also spotted on non-Drupal www.minutouno.com
    # Duplicate class="" on stopbadware.org
    body = re.sub(br'<(body|div)( id="[^"]+")? class="[^"]+"( class="[^"]+")?( data-src="[^"]{1,2000}")?', b"", body)

    return body
|
|
|
|
|
|
def compare_bodies(body1, body2, url1, url2):
    """Write a unified diff of two page bodies to standard output.

    `body1` and `body2` are bytes; they are decoded as UTF-8 with
    undecodable bytes replaced.  `url1` and `url2` label the two sides in
    the diff header.  Output is written as UTF-8 bytes to
    `sys.stdout.buffer`, and every emitted line is newline-terminated.
    """
    # TODO: handle non-utf-8 bodies
    left_lines = body1.decode("utf-8", "replace").splitlines(keepends=True)
    right_lines = body2.decode("utf-8", "replace").splitlines(keepends=True)
    diff = difflib.unified_diff(left_lines, right_lines, fromfile=url1, tofile=url2)
    for entry in diff:
        # A final body line without a trailing newline comes through
        # unterminated; terminate it so the output stays line-oriented.
        terminated = entry if "\n" in entry else entry + "\n"
        sys.stdout.buffer.write(terminated.encode("utf-8"))
|
|
|
|
|
|
def compare_unprocessed_bodies(up_body1, up_body2, url1, url2):
    """Post-process two raw page bodies, print a summary and diff them.

    Each raw body is run through `process_body` with its URL.  The MD5 of
    each URL and the length of each processed body are printed, then the
    processed bodies are diffed with `compare_bodies`.
    """
    processed = (
        (url1, process_body(up_body1, url1)),
        (url2, process_body(up_body2, url2)),
    )
    for page_url, _ in processed:
        print("{} == md5({!r})".format(md5_url(page_url), page_url))
    print("After processing,")
    for page_url, page_body in processed:
        print("len(body({!r})) == {}".format(page_url, len(page_body)))
    (_, body1), (_, body2) = processed
    compare_bodies(body1, body2, url1, url2)
|
|
|
|
|
|
def main():
    """Command-line entry point.

    With one URL argument, fetch the page (or load it from the cache) and
    write its raw bytes to standard output.  With two URL arguments, fetch
    both pages and print a diff of their post-processed bodies.  Any other
    argument count exits with a usage message.

    Raises:
        SystemExit: if the number of arguments is not 1 or 2.
        OSError: if the cache directory cannot be created.
    """
    args = sys.argv[1:]
    # Validate with a real exit rather than `assert`, which is stripped
    # under `python -O` and would let a bad invocation pass silently.
    if len(args) not in (1, 2):
        prog = sys.argv[0] if sys.argv else "prog"
        raise SystemExit("usage: {} URL [URL]".format(prog))

    # exist_ok tolerates an existing directory but still reports real
    # failures (permissions, a plain file in the way) instead of hiding them.
    os.makedirs(cache_dir, exist_ok=True)

    if len(args) == 1:
        # Just save and print the body.  It is bytes, so write it raw;
        # print() would emit its b'...' repr instead of the page content.
        sys.stdout.buffer.write(get_body(args[0]))
    else:
        url1, url2 = args
        compare_unprocessed_bodies(get_body(url1), get_body(url2), url1, url2)


if __name__ == '__main__':
    main()
|