youtube-dl-gui/devscripts/check-translation.py

315 lines
9.4 KiB
Python

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Author: Sotiris Papadopoulos <ytubedlg@gmail.com>
Last-Revision: 2017-04-19
Script to automatically check PO files
"""
from __future__ import unicode_literals
import os
import sys
import logging
import argparse
from time import sleep
from datetime import datetime, timedelta, tzinfo
try:
import polib
import google_translate
except ImportError as error:
print(error)
sys.exit(1)
# Seconds to wait between two translator requests (avoids getting banned)
WTIME = 2.0

# Package name; the PO file is named after it
PACKAGE = "youtube_dl_gui"
PO_FILENAME = PACKAGE + ".po"

# Relative path template of a locale's PO file, filled in with .format(lang=...)
LOCALE_PATH_TMPL = os.path.join(
    PACKAGE,
    "locale",
    "{lang}",
    "LC_MESSAGES",
    PO_FILENAME
)

# Configure the root logger to emit only ERROR and above
logging.basicConfig(level=logging.ERROR)
def parse():
    """Build the command line interface and return the parsed arguments.

    The returned namespace carries 'language', 'werror', 'only_headers',
    'no_translate' and 'tlang'. Arguments are read from sys.argv.
    """
    cli = argparse.ArgumentParser(description="Script to automatically check PO files")

    cli.add_argument("language", help="language of the PO file to check")

    # Boolean switches as (short flag, long flag, help text)
    switches = (
        ("-w", "--werror", "treat all warning messages as errors"),
        ("-o", "--only-headers", "check only the PO file headers"),
        ("-n", "--no-translate", "do not use the translator to check 'msgstr' fields"),
    )

    for short_flag, long_flag, help_text in switches:
        cli.add_argument(short_flag, long_flag, action="store_true", help=help_text)

    cli.add_argument("-t", "--tlang", help="force a different language on the translator than the one given")

    return cli.parse_args()
class UTC_Offset_Timezone(tzinfo):
    """Class that represents a fixed UTC offset in the format +/-0000.

    The offset never changes, so no daylight saving adjustment is applied.
    """

    def __init__(self, offset_string):
        """Store the offset described by 'offset_string' (see parse_offset)."""
        self.offset = timedelta(seconds=UTC_Offset_Timezone.parse_offset(offset_string))

    def utcoffset(self, dt):
        """Return the offset from UTC as a timedelta ('dt' is not used)."""
        return self.offset + self.dst(dt)

    def dst(self, dt):
        """Return a zero timedelta since a fixed offset has no DST."""
        return timedelta(0)

    def tzname(self, dt):
        """Return the offset formatted as +/-HHMM.

        The base tzinfo.tzname() raises NotImplementedError, so without
        this override datetime.tzname() and strftime("%Z") fail on dates
        that use this timezone.
        """
        total_seconds = self.offset.days * 86400 + self.offset.seconds
        sign = "-" if total_seconds < 0 else "+"
        hours, remainder = divmod(abs(total_seconds), 3600)

        # String literals in this file are unicode (unicode_literals), str()
        # turns the result into the native string type on Python 2 and 3
        return str("{}{:02d}{:02d}".format(sign, hours, remainder // 60))

    @staticmethod
    def parse_offset(offset_string):
        """Parse the offset string into seconds.

        'offset_string' must be exactly five characters: a sign followed by
        the hours and minutes (HHMM). The sign is '+', '-' or a blank, the
        blank is kept for backwards compatibility and means '+'.

        Returns the offset in seconds as an int, negative for '-'.
        Raises ValueError when the string is malformed.
        """
        if len(offset_string) != 5:
            raise ValueError("Invalid length for offset string ({})".format(offset_string))

        sign = offset_string[0]
        digits = offset_string[1:]

        # Reject garbage instead of silently treating it as a positive offset
        if sign not in ("+", "-", " ") or not digits.isdigit():
            raise ValueError("Invalid offset string ({})".format(offset_string))

        offset = int(digits[:2]) * 3600 + int(digits[2:]) * 60

        if sign == "-":
            return -1 * offset

        return offset


def parse_date(date_string):
    """Parse date string into an aware datetime object.

    'date_string' starts with 'YYYY-MM-DD HH:MM' and is followed by the
    timezone, either a numeric offset ('+0300', '-0500', '0300') or one of
    the known abbreviations ('EEST', 'EET', 'GMT', 'UTC'). Whitespace
    between the time and the timezone is optional.

    Raises ValueError when the date or the timezone can not be parsed.
    """
    # Just a small table with the most common timezones
    known_offsets = {
        "EEST": "+0300",
        "EET": "+0200",
        "GMT": "+0000",
        "UTC": "+0000"
    }

    # The date & time part has a fixed width, the rest is the timezone
    datetime_string = date_string[:16]
    zone = date_string[16:].strip()

    # Replace a known abbreviation with its offset
    offset_string = known_offsets.get(zone, zone)

    # An offset without a sign is east of UTC
    if len(offset_string) == 4 and offset_string.isdigit():
        offset_string = "+" + offset_string

    naive_date = datetime.strptime(datetime_string, "%Y-%m-%d %H:%M")

    # Create & return an aware datetime object based on the offset
    return naive_date.replace(tzinfo=UTC_Offset_Timezone(offset_string))
# Print helpers
def my_print(msg, char="*", value=None, exit=False):
    """Print 'msg' prefixed with '[char]', followed by 'value' when given.

    When 'exit' is True the script terminates with exit status 1 after
    the message has been printed.
    """
    output = ["[{}] {}".format(char, msg)]

    # The debug value goes on its own tab-indented line
    if value is not None:
        output.append("\tvalue= \"{}\"".format(value))

    print("\n".join(output))

    if not exit:
        return

    sys.exit(1)
def perror(msg, value=None):
    """Print 'msg' as an error (marked with '-') and always exit."""
    my_print(msg, char="-", value=value, exit=True)
def pwarn(msg, value=None, exit=False):
    """Print 'msg' as a warning (marked with '!') and exit if 'exit' is True."""
    my_print(msg, char="!", value=value, exit=exit)
def pinfo(msg):
    """Print 'msg' as an informational message using my_print's defaults."""
    my_print(msg=msg)
#############################
def _require_header(headers, name, file_path):
    """Return the header 'name' or exit with an error when it is missing."""
    if name not in headers:
        perror("Missing '{}' header, exiting...".format(name), file_path)

    return headers[name]


def _header_date(headers, name, file_path):
    """Return the header 'name' as an aware datetime or exit when it is invalid."""
    raw_date = _require_header(headers, name, file_path)

    try:
        return parse_date(raw_date)
    except ValueError:
        perror("Invalid date in '{}' header, exiting...".format(name), raw_date)


def _check_headers(pot_file, po_file, pot_file_path, po_file_path, args):
    """Compare the PO headers against the POT headers and report the findings."""
    pinfo("Checking PO headers")

    pot_headers = pot_file.metadata
    po_headers = po_file.metadata

    for name in ("Project-Id-Version", "POT-Creation-Date"):
        pot_value = _require_header(pot_headers, name, pot_file_path)
        po_value = _require_header(po_headers, name, po_file_path)

        if pot_value != po_value:
            pwarn("'{}' headers do not match".format(name), exit=args.werror)

    po_creation_date = _header_date(po_headers, "POT-Creation-Date", po_file_path)
    po_revision_date = _header_date(po_headers, "PO-Revision-Date", po_file_path)

    # Aware datetimes convert to UTC automatically when comparing
    if po_revision_date <= po_creation_date:
        pwarn("PO file seems outdated", exit=args.werror)

    if "Language" in po_headers and po_headers["Language"] != args.language:
        pwarn("'Language' header does not match with the given language", po_headers["Language"], args.werror)

    if "Last-Translator" in po_headers:
        pinfo("Last-Translator: {}".format(po_headers["Last-Translator"]))
    else:
        pwarn("'Last-Translator' header is missing", exit=args.werror)


def _get_source_language(translator, args):
    """Return the language code to hand to the translator as source language."""
    if args.tlang is not None:
        pinfo("Forcing '{}' as the translator's source language".format(args.tlang))
        return args.tlang

    # Get a valid source language for Google
    # for example convert 'ar_SA' to 'ar' or 'zh_CN' to 'zh-CN'
    src_lang = args.language

    if src_lang not in translator._lang_dict:
        src_lang = src_lang.replace("_", "-")

    if src_lang not in translator._lang_dict:
        src_lang = src_lang.split("-")[0]

    return src_lang


def _analyse_translations(translator, src_lang, entries):
    """Check the 'msgstr' of 'entries' online, return (with_typo, verify_trans)."""
    with_typo = []
    verify_trans = []

    # Pass translations as a list since GoogleTranslator can handle them
    words_dict = translator.get_info_dict([entry.msgstr for entry in entries], "en", src_lang)

    # The results do not contain the POEntry objects, pair them up by position
    for entry, word_dict in zip(entries, words_dict):
        if word_dict is None:
            continue

        if word_dict["has_typo"]:
            with_typo.append(entry)

        translation = word_dict["translation"]
        msgid = entry.msgid.lower()

        if translation.lower() == msgid:
            continue

        # Check verbs, nouns, adverbs, etc..
        extra = word_dict["extra"]

        if not any(msgid in extra[key].keys() for key in extra):
            verify_trans.append((entry, translation))

    return with_typo, verify_trans


def _print_section(title, lines):
    """Print 'title' followed by 'lines', print nothing when 'lines' is empty."""
    if lines:
        print(title)

        for line in lines:
            print(line)


def _print_report(missing_msgid, not_translated, same_msgstr, with_typo, verify_trans, fuzzy_trans):
    """Print the detailed findings followed by a summary with the totals."""
    entry_tmpl = " line: {} msgid: \"{}\""

    print("=" * 25 + "Report" + "=" * 25)

    _print_section("Missing msgids", [" \"{}\"".format(msgid) for msgid in missing_msgid])

    _print_section("Not translated", [entry_tmpl.format(entry.linenum, entry.msgid) for entry in not_translated])

    _print_section("Same msgstr", [entry_tmpl.format(entry.linenum, entry.msgid) for entry in same_msgstr])

    _print_section(
        "With typo",
        [" line: {} msgid: \"{}\" msgstr: \"{}\"".format(entry.linenum, entry.msgid, entry.msgstr) for entry in with_typo]
    )

    _print_section(
        "Verify translation",
        [" line: {} msgid: \"{}\" trans: \"{}\"".format(entry.linenum, entry.msgid, translation) for entry, translation in verify_trans]
    )

    _print_section("Fuzzy translations", [entry_tmpl.format(entry.linenum, entry.msgid) for entry in fuzzy_trans])

    total = len(missing_msgid) + len(not_translated) + len(same_msgstr) + len(with_typo) + len(verify_trans) + len(fuzzy_trans)

    print("")
    print("Missing msgids\t\t: {}".format(len(missing_msgid)))
    print("Not translated\t\t: {}".format(len(not_translated)))
    print("Same msgstr\t\t: {}".format(len(same_msgstr)))
    print("With typo\t\t: {}".format(len(with_typo)))
    print("Verify translation\t: {}".format(len(verify_trans)))
    print("Fuzzy translations\t: {}".format(len(fuzzy_trans)))
    print("Total\t\t\t: {}".format(total))


def main(args):
    """Check the PO file of 'args.language' against the 'en_US' template.

    'args' is the namespace returned by parse(). The headers are checked
    first, then (unless 'args.only_headers' is set) the entries are checked
    and a report is printed. Problems that make the check impossible (file
    or required header missing, invalid date) terminate the script with
    exit status 1, warnings do the same when 'args.werror' is set.
    """
    # The PO paths are relative to the parent directory
    # NOTE(review): presumably the script is started from 'devscripts' - confirm
    os.chdir("..")

    # setup
    pot_file_path = LOCALE_PATH_TMPL.format(lang="en_US")
    po_file_path = LOCALE_PATH_TMPL.format(lang=args.language)

    if not os.path.exists(pot_file_path):
        perror("Failed to locate POT file, exiting...", pot_file_path)

    if not os.path.exists(po_file_path):
        perror("Failed to locate PO file, exiting...", po_file_path)

    pot_file = polib.pofile(pot_file_path)
    po_file = polib.pofile(po_file_path)

    # check headers
    _check_headers(pot_file, po_file, pot_file_path, po_file_path, args)

    # check translations
    if args.only_headers:
        sys.exit(0)

    pinfo("Checking translations, this might take a while...")

    # Use a set so each lookup is O(1) instead of scanning a list
    po_msgid = set(entry.msgid for entry in po_file)
    missing_msgid = [entry.msgid for entry in pot_file if entry.msgid not in po_msgid]

    fuzzy_trans = po_file.fuzzy_entries()

    # Init translator only if the '--no-translate' flag is NOT set
    translator = None
    src_lang = None

    if not args.no_translate:
        eta = timedelta(seconds=len(pot_file) * WTIME)
        pinfo("Approximate time to check translations online: {}".format(eta))

        translator = google_translate.GoogleTranslator(timeout=5.0, retries=2, wait_time=WTIME)
        src_lang = _get_source_language(translator, args)

    # Keep entries that need further analysis using the translator
    not_translated = []
    same_msgstr = []
    further_analysis = []

    for entry in po_file:
        if not entry.translated():
            not_translated.append(entry)
        elif entry.msgid == entry.msgstr:
            same_msgstr.append(entry)
        else:
            further_analysis.append(entry)

    with_typo = []
    verify_trans = []

    if translator is not None:
        with_typo, verify_trans = _analyse_translations(translator, src_lang, further_analysis)

    # time to report
    _print_report(missing_msgid, not_translated, same_msgstr, with_typo, verify_trans, fuzzy_trans)
if __name__ == "__main__":
    # Run the checker only when executed as a script. Ctrl-C prints a
    # short notice instead of a traceback.
    try:
        cli_args = parse()
        main(cli_args)
    except KeyboardInterrupt:
        print("KeyboardInterrupt")