Add chardet_confidence_limit option for 'auto' encoding setting.

This commit is contained in:
Jim Miller 2017-04-29 18:34:08 -05:00
parent 7f4bc5c36e
commit d2feac0c66
3 changed files with 12 additions and 1 deletions

View File

@ -123,6 +123,10 @@ include_tocpage: true
## it has +90% confidence. 'auto' is not reliable.
#website_encodings: utf8, Windows-1252, iso-8859-1
## When using 'auto' in website_encodings, you can tweak the
## confidence chardet must exceed before its detected encoding is used.
#chardet_confidence_limit:0.9
## entries to make epub subjects and calibre tags
## lastupdate creates two tags: "Last Update Year/Month: %Y/%m" and "Last Update: %Y/%m/%d"
include_subject_tags: extratags, genre, category, characters, ships, status

View File

@ -354,6 +354,7 @@ def get_valid_keywords():
'include_subject_tags',
'include_titlepage',
'include_tocpage',
'chardet_confidence_limit',
'is_adult',
'join_string_authorHTML',
'keep_style_attr',
@ -912,9 +913,11 @@ class Configuration(ConfigParser.SafeConfigParser):
continue
detected = chardet.detect(data)
#print detected
if detected['confidence'] > 0.9:
if detected['confidence'] > float(self.getConfig("chardet_confidence_limit",0.9)):
logger.debug("using chardet detected encoding:%s(%s)"%(detected['encoding'],detected['confidence']))
code=detected['encoding']
else:
logger.debug("chardet confidence too low:%s(%s)"%(detected['encoding'],detected['confidence']))
continue
return data.decode(code)
except:

View File

@ -123,6 +123,10 @@ include_tocpage: true
## it has +90% confidence. 'auto' is not reliable.
#website_encodings: utf8, Windows-1252, iso-8859-1
## When using 'auto' in website_encodings, you can tweak the
## confidence chardet must exceed before its detected encoding is used.
#chardet_confidence_limit:0.9
## python string Template, string with ${title}, ${author} etc, same as titlepage_entries
## Can include directories.
#output_filename: books/${title}-${siteabbrev}_${storyId}${formatext}