Merge pull request #222 from JustAnotherArchivist/warc-header-gs-version

Record grab-site version in WARC headers
master
Ivan Kozik 2022-06-27 16:15:01 -07:00 committed by GitHub
commit df06e14415
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
1 changed file with 4 additions and 1 deletion

View File

@@ -6,7 +6,7 @@ from wpull.database.sqltable import SQLiteURLTable
from wpull.document.html import HTMLReader
from wpull.processor.rule import ProcessingRule
from libgrabsite import dupespotter
from libgrabsite import dupespotter, __version__
from libgrabsite.dupes import DupesOnDisk
@@ -58,6 +58,9 @@ class DupeSpottingProcessingRule(ProcessingRule):
def activate(app_session):
app_session.factory.class_map['URLTableImplementation'] = NoFsyncSQLTable
warc_recorder_cls = app_session.factory.class_map['WARCRecorder']
warc_recorder_cls.DEFAULT_SOFTWARE_STRING = f'grab-site/{__version__} ' + warc_recorder_cls.DEFAULT_SOFTWARE_STRING
if int(os.environ["DUPESPOTTER_ENABLED"]):
dupes_db_location = os.path.join(os.environ["GRAB_SITE_WORKING_DIR"], "dupes_db")
dupes_db = DupesOnDisk(dupes_db_location)