Move everything and make grab-site installable with pip3

This commit is contained in:
Ivan Kozik 2015-07-18 10:29:49 +00:00
parent 1266cf6c97
commit 43d8a9594f
13 changed files with 42 additions and 8 deletions

2
.gitignore vendored
View File

@ -1 +1 @@
/__pycache__
__pycache__

View File

@ -37,17 +37,18 @@ echo "global,$igsets" > "$dir/igsets"
touch "$dir/igoff"
touch "$dir/ignores"
LIBGRABSITE="$(python3 -c 'import os, libgrabsite; print(os.path.dirname(libgrabsite.__file__))')"
# Note: we use the default html5lib parser instead of the lxml that ArchiveBot uses
# html5lib is slower, but it is better at parsing and, unlike lxml, doesn't (on rare occasions) corrupt the heap
GRAB_SITE_WORKING_DIR="$dir" PYTHONPATH="$self" "$self/wpull" \
GRAB_SITE_WORKING_DIR="$dir" "$self/patched-wpull" \
-U "Mozilla/5.0 (Windows NT 6.3; WOW64; rv:39.0) Gecko/20100101 Firefox/39.0" \
--header="Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8" \
--header="Accept-Language: en-US,en;q=0.5" \
-o "$dir/wpull.log" \
--database "$dir/wpull.db" \
--plugin-script "$self/plugin.py" \
--python-script "$self/wpull_hooks.py" \
--plugin-script "$LIBGRABSITE/plugin.py" \
--python-script "$LIBGRABSITE/wpull_hooks.py" \
--plugin-args " --dupes-db $dir/dupes_db" \
--save-cookies "$dir/cookies.txt" \
--no-check-certificate \

4
gs-server Normal file
View File

@ -0,0 +1,4 @@
#!/usr/bin/python3
"""Launcher script: runs the main() entry point of libgrabsite.server."""
from libgrabsite.server import main

main()

1
libgrabsite/__init__.py Normal file
View File

@ -0,0 +1 @@
# Package version string; setup.py imports libgrabsite and passes this value as `version`.
__version__ = '0.1.0'

View File

@ -6,8 +6,8 @@ from wpull.database.sqltable import SQLiteURLTable
from wpull.document.html import HTMLReader
import wpull.processor.rule
import dupespotter
from dupes import DupesInMemory, DupesOnDisk
from libgrabsite import dupespotter
from libgrabsite.dupes import DupesInMemory, DupesOnDisk

View File

@ -7,7 +7,7 @@ import signal
import trollius as asyncio
from urllib.request import urlopen
from autobahn.asyncio.websocket import WebSocketClientFactory, WebSocketClientProtocol
from ignoracle import Ignoracle, parameterize_record_info
from libgrabsite.ignoracle import Ignoracle, parameterize_record_info
realStdoutWrite = sys.stdout.buffer.write
realStderrWrite = sys.stderr.buffer.write

28
setup.py Normal file
View File

@ -0,0 +1,28 @@
#!/usr/bin/python3
"""Packaging script for grab-site: the libgrabsite package plus its command-line scripts."""

try:
    from setuptools import setup
except ImportError:
    # NOTE(review): plain distutils does not appear to process
    # `install_requires`, so dependencies would not be installed on this
    # fallback path -- confirm the fallback is still wanted.
    from distutils.core import setup

# Imported only to read __version__, so the version lives in a single place.
import libgrabsite

setup(
    name="grab-site",
    version=libgrabsite.__version__,
    description="The archivist's web crawler: WARC output, dashboard for all crawls, dynamic ignore patterns",
    url="https://github.com/ludios/grab-site",
    author="Ivan Kozik",
    author_email="ivan@ludios.org",
    classifiers=[
        "Programming Language :: Python :: 3",
        "Development Status :: 3 - Alpha",
        "Intended Audience :: End Users/Desktop",
        "License :: OSI Approved :: MIT License",
        "Topic :: Internet :: WWW/HTTP",
    ],
    # Executable scripts copied verbatim into the environment's bin directory.
    scripts=["grab-site", "gs-server", "patched-wpull"],
    packages=["libgrabsite"],
    # Ship the HTML files that live inside the package alongside the code.
    package_data={"libgrabsite": ["*.html"]},
    install_requires=["wpull", "manhole", "lmdb", "autobahn", "aiohttp", "trollius"],
)