Move everything and make grab-site installable with pip3
This commit is contained in:
parent
1266cf6c97
commit
43d8a9594f
2
.gitignore
vendored
2
.gitignore
vendored
@ -1 +1 @@
|
||||
/__pycache__
|
||||
__pycache__
|
||||
|
@ -37,17 +37,18 @@ echo "global,$igsets" > "$dir/igsets"
|
||||
touch "$dir/igoff"
|
||||
touch "$dir/ignores"
|
||||
|
||||
LIBGRABSITE="$(python3 -c 'import os, libgrabsite; print(os.path.dirname(libgrabsite.__file__))')"
|
||||
|
||||
# Note: we use the default html5lib parser instead of the lxml that ArchiveBot uses
|
||||
# html5lib is slower, but it is better at parsing and doesn't corrupt the heap the way lxml (rarely) does
|
||||
|
||||
GRAB_SITE_WORKING_DIR="$dir" PYTHONPATH="$self" "$self/wpull" \
|
||||
GRAB_SITE_WORKING_DIR="$dir" "$self/patched-wpull" \
|
||||
-U "Mozilla/5.0 (Windows NT 6.3; WOW64; rv:39.0) Gecko/20100101 Firefox/39.0" \
|
||||
--header="Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8" \
|
||||
--header="Accept-Language: en-US,en;q=0.5" \
|
||||
-o "$dir/wpull.log" \
|
||||
--database "$dir/wpull.db" \
|
||||
--plugin-script "$self/plugin.py" \
|
||||
--python-script "$self/wpull_hooks.py" \
|
||||
--plugin-script "$LIBGRABSITE/plugin.py" \
|
||||
--python-script "$LIBGRABSITE/wpull_hooks.py" \
|
||||
--plugin-args " --dupes-db $dir/dupes_db" \
|
||||
--save-cookies "$dir/cookies.txt" \
|
||||
--no-check-certificate \
|
||||
|
4
gs-server
Normal file
4
gs-server
Normal file
@ -0,0 +1,4 @@
|
||||
#!/usr/bin/python3
|
||||
|
||||
from libgrabsite import server
|
||||
server.main()
|
1
libgrabsite/__init__.py
Normal file
1
libgrabsite/__init__.py
Normal file
@ -0,0 +1 @@
|
||||
__version__ = '0.1.0'
|
@ -6,8 +6,8 @@ from wpull.database.sqltable import SQLiteURLTable
|
||||
from wpull.document.html import HTMLReader
|
||||
import wpull.processor.rule
|
||||
|
||||
import dupespotter
|
||||
from dupes import DupesInMemory, DupesOnDisk
|
||||
from libgrabsite import dupespotter
|
||||
from libgrabsite.dupes import DupesInMemory, DupesOnDisk
|
||||
|
||||
|
||||
|
@ -7,7 +7,7 @@ import signal
|
||||
import trollius as asyncio
|
||||
from urllib.request import urlopen
|
||||
from autobahn.asyncio.websocket import WebSocketClientFactory, WebSocketClientProtocol
|
||||
from ignoracle import Ignoracle, parameterize_record_info
|
||||
from libgrabsite.ignoracle import Ignoracle, parameterize_record_info
|
||||
|
||||
realStdoutWrite = sys.stdout.buffer.write
|
||||
realStderrWrite = sys.stderr.buffer.write
|
28
setup.py
Normal file
28
setup.py
Normal file
@ -0,0 +1,28 @@
|
||||
#!/usr/bin/python3
|
||||
|
||||
try:
|
||||
from setuptools import setup
|
||||
except ImportError:
|
||||
from distutils.core import setup
|
||||
|
||||
import libgrabsite
|
||||
|
||||
setup(
|
||||
name="grab-site",
|
||||
version=libgrabsite.__version__,
|
||||
description="The archivist's web crawler: WARC output, dashboard for all crawls, dynamic ignore patterns",
|
||||
url="https://github.com/ludios/grab-site",
|
||||
author="Ivan Kozik",
|
||||
author_email="ivan@ludios.org",
|
||||
classifiers=[
|
||||
"Programming Language :: Python :: 3",
|
||||
"Development Status :: 3 - Alpha",
|
||||
"Intended Audience :: End Users/Desktop",
|
||||
"License :: OSI Approved :: MIT License",
|
||||
"Topic :: Internet :: WWW/HTTP",
|
||||
],
|
||||
scripts=["grab-site", "gs-server", "patched-wpull"],
|
||||
packages=["libgrabsite"],
|
||||
package_data={"libgrabsite": ["*.html"]},
|
||||
install_requires=["wpull", "manhole", "lmdb", "autobahn", "aiohttp", "trollius"],
|
||||
)
|
Loading…
x
Reference in New Issue
Block a user