Add chardet_confidence_limit option for 'auto' encoding setting.
This commit is contained in:
parent
7f4bc5c36e
commit
d2feac0c66
@ -123,6 +123,10 @@ include_tocpage: true
|
|||||||
## it has +90% confidence. 'auto' is not reliable.
|
## it has +90% confidence. 'auto' is not reliable.
|
||||||
#website_encodings: utf8, Windows-1252, iso-8859-1
|
#website_encodings: utf8, Windows-1252, iso-8859-1
|
||||||
|
|
||||||
|
## When using 'auto' in website_encodings, you can tweak the
|
||||||
|
## confidence required to use the encoding detected by chardet.
|
||||||
|
#chardet_confidence_limit: 0.9
|
||||||
|
|
||||||
## entries to make epub subjects and calibre tags
|
## entries to make epub subjects and calibre tags
|
||||||
## lastupdate creates two tags: "Last Update Year/Month: %Y/%m" and "Last Update: %Y/%m/%d"
|
## lastupdate creates two tags: "Last Update Year/Month: %Y/%m" and "Last Update: %Y/%m/%d"
|
||||||
include_subject_tags: extratags, genre, category, characters, ships, status
|
include_subject_tags: extratags, genre, category, characters, ships, status
|
||||||
|
@ -354,6 +354,7 @@ def get_valid_keywords():
|
|||||||
'include_subject_tags',
|
'include_subject_tags',
|
||||||
'include_titlepage',
|
'include_titlepage',
|
||||||
'include_tocpage',
|
'include_tocpage',
|
||||||
|
'chardet_confidence_limit',
|
||||||
'is_adult',
|
'is_adult',
|
||||||
'join_string_authorHTML',
|
'join_string_authorHTML',
|
||||||
'keep_style_attr',
|
'keep_style_attr',
|
||||||
@ -912,9 +913,11 @@ class Configuration(ConfigParser.SafeConfigParser):
|
|||||||
continue
|
continue
|
||||||
detected = chardet.detect(data)
|
detected = chardet.detect(data)
|
||||||
#print detected
|
#print detected
|
||||||
if detected['confidence'] > 0.9:
|
if detected['confidence'] > float(self.getConfig("chardet_confidence_limit",0.9)):
|
||||||
|
logger.debug("using chardet detected encoding:%s(%s)"%(detected['encoding'],detected['confidence']))
|
||||||
code=detected['encoding']
|
code=detected['encoding']
|
||||||
else:
|
else:
|
||||||
|
logger.debug("chardet confidence too low:%s(%s)"%(detected['encoding'],detected['confidence']))
|
||||||
continue
|
continue
|
||||||
return data.decode(code)
|
return data.decode(code)
|
||||||
except:
|
except:
|
||||||
|
@ -123,6 +123,10 @@ include_tocpage: true
|
|||||||
## it has +90% confidence. 'auto' is not reliable.
|
## it has +90% confidence. 'auto' is not reliable.
|
||||||
#website_encodings: utf8, Windows-1252, iso-8859-1
|
#website_encodings: utf8, Windows-1252, iso-8859-1
|
||||||
|
|
||||||
|
## When using 'auto' in website_encodings, you can tweak the
|
||||||
|
## confidence required to use the encoding detected by chardet.
|
||||||
|
#chardet_confidence_limit: 0.9
|
||||||
|
|
||||||
## python string Template, string with ${title}, ${author} etc, same as titlepage_entries
|
## python string Template, string with ${title}, ${author} etc, same as titlepage_entries
|
||||||
## Can include directories.
|
## Can include directories.
|
||||||
#output_filename: books/${title}-${siteabbrev}_${storyId}${formatext}
|
#output_filename: books/${title}-${siteabbrev}_${storyId}${formatext}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user