From f9e2e8ce2c5ed1da0ce4f55b13785a55c64674bb Mon Sep 17 00:00:00 2001 From: Arjix <53124886+ArjixGamer@users.noreply.github.com> Date: Mon, 3 May 2021 20:16:18 +0300 Subject: [PATCH] improved the search results --- anime_downloader/sites/animtime.py | 108 ++++++++++++++++++++--------- 1 file changed, 76 insertions(+), 32 deletions(-) diff --git a/anime_downloader/sites/animtime.py b/anime_downloader/sites/animtime.py index af46f24..5908aed 100644 --- a/anime_downloader/sites/animtime.py +++ b/anime_downloader/sites/animtime.py @@ -6,36 +6,79 @@ from difflib import get_close_matches import re +def format_title_case(text): + """ + Will format text to title case and it will have Roman numerals in capital case + only I is supported so only up to III, any number bigger than that will keep its original capitalization case + """ + words = text.split() + new_text = [] + + for word in words: + if word.lower().replace('i', '') == '': + new_text += ['I' * len(word)] + continue + + elif word.lower() == 'dub': + new_text += ['(Dub)'] + continue + + new_text += [word.title()] + + return ' '.join(new_text) + + +def get_title_dict(script): + """ + Returns a tuple with two dictionaries + the 1st one has the anime slugs with their pretty title + and the 2nd one has the anime slugs with their ids + """ + script_text = helpers.get(script).text + title_function = re.search("tm=.*?}", script_text).group() + titles_dict = { + x[0]: format_title_case(x[1].replace('-', ' ')) + for x in re.findall(r"qd\[tm\.(.*?)\]=.*?\".*?/animtime/(.*?)/", script_text) + } + id_dict = { + x[0]: x[1] + for x in re.findall(r"t\[t\.(.*?)=(\d+)", title_function) + } + + for title in id_dict: + """ + For any anime that are not matched in the pretty titles dictionary (titles_dict) + + for example Bleach (with the id of 1) is not in titles_dict + """ + if title not in titles_dict: + titles_dict[title] = ' '.join( + re.sub(r"([A-Z])", r" \1", title).split()) + + return titles_dict, id_dict + + +def 
get_script_link(): + soup = helpers.soupify(helpers.get('https://animtime.com')) + script = 'https://animtime.com/' + \ + soup.select('script[src*=main]')[0].get('src') + + return script + + class AnimTime(Anime, sitename='animtime'): sitename = 'animtime' - @classmethod - def get_title_dict(cls, script): - script_text = helpers.get(script).text - title_function = re.search("tm=.*?}", script_text).group() - titles_regexed = re.findall("t\[t\.(.*?)=(\d+)", title_function) - titles = dict([(' '.join(re.sub(r"([A-Z])", r" \1", x[0]).split()), x[1]) - for x in titles_regexed]) - - return titles - - @classmethod - def get_script_link(cls): - soup = helpers.soupify(helpers.get('https://animtime.com')) - script = 'https://animtime.com/' + \ - soup.select('script[src*=main]')[0].get('src') - - return script - @classmethod def search(cls, query): - titles = cls.get_title_dict(cls.get_script_link()) - matches = get_close_matches(query, titles, cutoff=0.2) + titles = get_title_dict(get_script_link()) + matches = get_close_matches(query, titles[0], cutoff=0.2) search_results = [ SearchResult( - title=match, - url='https://animtime.com/title/{}'.format(titles.get(match)) + title=titles[0].get(match), + url='https://animtime.com/title/{}'.format( + titles[1].get(match)) ) for match in matches ] @@ -43,13 +86,13 @@ class AnimTime(Anime, sitename='animtime'): return search_results def _scrape_episodes(self): - link = self.get_script_link() - titles = dict((y, x) for x, y in self.get_title_dict(link).items()) + link = get_script_link() + titles = dict((y, x) for x, y in get_title_dict(link)[1].items()) current_title = titles.get(self.url.split('/')[-1]) script_text = helpers.get(link).text ep_count = int(re.search( - "zd\[tm\.{}\]=(\d+)".format(current_title.replace(' ', '')), script_text).group(1)) + r"\[tm\.{}\]=(\d+)".format(current_title.replace(' ', '')), script_text).group(1)) episodes = [] for i in range(ep_count): @@ -58,19 +101,20 @@ class AnimTime(Anime, 
sitename='animtime'): return episodes def _scrape_metadata(self): - titles = dict((y, x) for (x, y) in self.get_title_dict( - self.get_script_link()).items()) - self.title = titles.get(self.url.split('/')[-1]) + titles = get_title_dict(get_script_link())[1] + self.title = next(x for x, y in titles.items() + if int(y) == int(self.url.split('/')[-1])) class AnimTimeEpisode(AnimeEpisode, sitename='animtime'): def _get_sources(self): - titles = dict((y, x) for x, y in AnimTime.get_title_dict( - AnimTime.get_script_link()).items()) - current_title = titles.get(self.url.split('/')[-3]) + titles = get_title_dict(get_script_link())[1] + + current_title = next(x for x, y in titles.items() + if int(y) == int(self.url.split('/')[-3])) current_ep = "{0:03}".format(int(self.url.split('/')[-1])) - script_text = helpers.get(AnimTime.get_script_link()).text + script_text = helpers.get(get_script_link()).text regexed_link = re.search('tm\.' + current_title.replace(" ", "") + '\]=function\(.*?return.*?(https.*?)"}', script_text).group(1) link = regexed_link.replace('"+t+"', current_ep)