', b"", body) # Spotted on eff.org drupal body = re.sub(br'', b"", body) # Spotted on http://www.museodelvideojuego.com/ - handles # # Spotted on http://2045.com/ # ]{1,16384}?>', b"", body) # Spotted on http://www.communauteanimalcrossing.fr/ body = re.sub(br'', b"", body) # vbulletin body = re.sub(br'\(\d+ Viewing\)', b"", body) body = re.sub(br'Currently Active Users: \d+ \(\d+ members and \d+ guests\)', b"", body) # v= on http://vstreamers.com/v/images/css/p/videos # cb= on megahits.sapo.pt # pos= on www.smartcast.com.mx body = re.sub(br'[&\?]((v|cb)=\d+|pos=[A-Za-z0-9=]+)', b"", body) # spotted on espn.go.com and others body = re.sub(br'(splinks-|var hash = .|":"?)-?\d+', b"", body) # Kill newrelic inline script body = re.sub(br'window\.NREUM\|\|\(NREUM=\{\}\);NREUM\.info=\{.{1,3000}?\}', b"", body) if drupal: # Kill entire Drupal settings line body = re.sub(br'jQuery\.extend\(Drupal.settings, ?\{.{1,40000}?\}\);', b"", body) # Drupal generates this class id body = re.sub(br"\bview-dom-id-[0-9a-f]+\b", b"", body) # Drupal sites have randomized sidebar content with these IDs body = re.sub(br'

.*', b"", body) # nsslabs.com has this body = re.sub(br'', b"", body) # sbs.com.au has generated /css_ filenames body = re.sub(br'/css_[-_A-Za-z0-9]{10,100}\.css', b"", body) # stopbadware.org has some differing autogenerated ', b"", body) # Drupal generates items based on the URL # Generated class="" also spotted on non-Drupal www.minutouno.com # Duplicate class="" on stopbadware.org body = re.sub(br'<(body|div)( id="[^"]+")? class="[^"]+"( class="[^"]+")?( data-src="[^"]{1,2000}")?', b"", body) return body def compare_bodies(body1, body2, url1, url2): # TODO: handle non-utf-8 bodies for line in difflib.unified_diff( body1.decode("utf-8", "replace").splitlines(keepends=True), body2.decode("utf-8", "replace").splitlines(keepends=True), fromfile=url1, tofile=url2): if not "\n" in line: line += "\n" sys.stdout.buffer.write(line.encode("utf-8")) def compare_unprocessed_bodies(up_body1, up_body2, url1, url2): body1 = process_body(up_body1, url1) body2 = process_body(up_body2, url2) print("{} == md5({!r})".format(md5_url(url1), url1)) print("{} == md5({!r})".format(md5_url(url2), url2)) print("After processing,") print("len(body({!r})) == {}".format(url1, len(body1))) print("len(body({!r})) == {}".format(url2, len(body2))) compare_bodies(body1, body2, url1, url2) def main(): try: os.makedirs(cache_dir) except OSError: pass assert os.path.exists(cache_dir) if len(sys.argv) == 2: # Just save and print the body print(get_body(sys.argv[1])) elif len(sys.argv) == 3: url1, url2 = sys.argv[1], sys.argv[2] compare_unprocessed_bodies(get_body(url1), get_body(url2), url1, url2) else: assert 0, sys.argv if __name__ == '__main__': main()