anime-downloader/anime_downloader/animeinfo.py

220 lines
8.0 KiB
Python

from anime_downloader.sites import helpers
import logging
from anime_downloader.sites.anime import Anime, AnimeEpisode, SearchResult
from anime_downloader.sites import get_anime_class
from anime_downloader.config import Config
from anime_downloader.util import primitive_search
import warnings

# fuzzywuzzy emits a warning at import time (e.g. when python-Levenshtein
# is absent); suppress it locally without touching global warning filters.
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    from fuzzywuzzy import fuzz

logger = logging.getLogger(__name__)
class AnimeInfo:
    """
    Container for show metadata scraped from an info page (MAL/AniList).

    Attributes
    ----------
    url: string
        URL for the info page
    title: string
        English name of the show.
    jp_title: string
        Japanese name of the show.
    metadata: dict
        Data not critical for core functions
    episodes: int
        Max amount of episodes
    """

    def __init__(self, url, episodes, title=None, jp_title=None, metadata=None):
        self.url = url
        self.episodes = episodes
        self.title = title
        self.jp_title = jp_title
        # `metadata={}` as a default would be a single dict shared by every
        # instance constructed without one (mutable default pitfall), so a
        # fresh dict is created per instance instead.
        self.metadata = {} if metadata is None else metadata
class MatchObject:
    """
    Pairs a metadata result with a provider result plus their similarity.

    Attributes
    ----------
    AnimeInfo: object
        Metadata object from the MAL search.
    SearchResult: object
        Metadata object from the provider search
    ratio: int
        A number between 0-100 describing the similarities between
        SearchResult and AnimeInfo. Higher number = more similar.
    """

    def __init__(self, AnimeInfo, SearchResult, ratio=100):
        # Plain data holder: attribute names deliberately mirror the
        # parameter names, since both are read directly by callers.
        self.ratio = ratio
        self.AnimeInfo = AnimeInfo
        self.SearchResult = SearchResult
# Not used
def search_mal(query):
def search(query):
soup = helpers.soupify(helpers.get('https://myanimelist.net/anime.php', params={'q': query}))
search_results = soup.select("a.hoverinfo_trigger.fw-b.fl-l")
return [SearchResult(
url=i.get('href'),
title=i.select('strong')[0].text
) for i in search_results]
def scrape_metadata(url):
soup = helpers.soupify(helpers.get(url))
"""
info_dict contains something like this: [{
'url': 'https://myanimelist.net/anime/37779/Yakusoku_no_Neverland',
'title': 'The Promised Neverland',
'jp_title': '約束のネバーランド'
},{
'url': 'https://myanimelist.net/anime/39617/Yakusoku_no_Neverland_2nd_Season',
'title': 'The Promised Neverland 2nd Season',
'jp_title': '約束のネバーランド 第2期'}]
"""
info_dict = {
'url': url
}
# Maps specified info in sidebar to variables in info_dict
name_dict = {
'Japanese:': 'jp_title',
'English:': 'title',
'synonyms:': 'synonyms',
'Episodes:': 'episodes'
}
info = soup.select('span.dark_text')
extra_info = [i.parent.text.strip() for i in info]
for i in extra_info:
text = i.replace('\n', '').strip()
for j in name_dict:
if text.startswith(j):
info_dict[name_dict[j]] = text[len(j):].strip()
# Backup name if no English name isn't registered in sidebar
if not info_dict.get('title'):
name = soup.select('span[itemprop=name]')
info_dict['title'] = name[0].text if name else None
# Always sets episodes
if not info_dict.get('episodes') or info_dict.get('episodes') == 'Unknown':
info_dict['episodes'] = 0
# TODO error message when this stuff is not correctly scraped
# Can happen if MAL is down or something similar
return AnimeInfo(url=info_dict['url'], title=info_dict.get('title'),
jp_title=info_dict.get('jp_title'), episodes=int(info_dict['episodes']))
search_results = search(query)
season_info = []
# Max 10 results
for i in range(min(len(search_results), 10)):
anime_info = scrape_metadata(search_results[i].url)
if anime_info.episodes:
season_info.append(anime_info)
# Code below uses the first result to compare
#season_info = [scrape_metadata(search_results[0].url)]
# return season_info
# Prompts the user for selection
return primitive_search(season_info)
# Choice allows the user to preselect, used to download from a list overnight.
# None prompts the user.
def search_anilist(query, choice=None):
def search(query):
ani_query = """
query ($id: Int, $page: Int, $search: String, $type: MediaType) {
Page (page: $page, perPage: 10) {
media (id: $id, search: $search, type: $type) {
id
idMal
description(asHtml: false)
seasonYear
title {
english
romaji
native
}
coverImage {
extraLarge
}
bannerImage
averageScore
status
episodes
}
}
}
"""
url = 'https://graphql.anilist.co'
# TODO check in case there's no results
# It seems to error on no results (anime -ll DEBUG dl "nev")
results = helpers.post(url, json={'query': ani_query, 'variables': {'search': query, 'page': 1, 'type': 'ANIME'}}).json()['data']['Page']['media']
if not results:
logger.error('No results found in anilist')
raise NameError
search_results = [AnimeInfo(url='https://anilist.co/anime/' + str(i['id']), title=i['title']['romaji'],
jp_title=i['title']['native'], episodes=int(i['episodes']), metadata=i) for i in results if i['episodes'] != None]
return search_results
search_results = search(query)
# This can also be fuzzied, but too many options.
if choice != None:
# Fixes too low or high to get a real value.
fixed_choice = ((choice - 1) % len(search_results))
return search_results[fixed_choice]
else:
# Prompts the user for selection
return primitive_search(search_results)
def fuzzy_match_metadata(seasons_info, search_results):
    """
    Return the MatchObject pairing the most similar titles.

    Every (AnimeInfo, SearchResult) pair is scored with fuzzywuzzy on both
    the english and japanese titles; the pair with the highest ratio wins.
    """
    results = []
    for i in seasons_info:
        for j in search_results:
            # Allows for returning of cleaned title by the provider using
            # 'title_cleaned' in meta_info, to make fuzzy matching better.
            title_provider = j.title.strip() if not j.meta_info.get('title_cleaned') else j.meta_info.get('title_cleaned').strip()
            # Some entries have no english title at all; fall back to an
            # empty string instead of crashing on None concatenation below.
            title_info = i.title or ''
            # Essentially adds the chosen key to the query if the version is in use.
            # Dirty solution, but should work pretty well.
            config = Config['siteconfig'].get(get_anime_class(j.url).sitename, {})
            version = config.get('version')
            version_use = version == 'dubbed'
            # Adds something like (Sub) or (Dub) to the title.
            key_used = j.meta_info.get('version_key_dubbed', '') if version_use else j.meta_info.get('version_key_subbed', '')
            title_info += ' ' + key_used
            title_info = title_info.strip()
            # TODO add synonyms
            # 0 if the japanese name is missing on either side.
            jap_ratio = fuzz.ratio(i.jp_title, j.meta_info['jp_title']) if i.jp_title and j.meta_info.get('jp_title') else 0
            # Outputs the max ratio for japanese or english name (0-100).
            ratio = max(fuzz.ratio(title_info, title_provider), jap_ratio)
            logger.debug('Ratio: {}, Info title: {}, Provider Title: {}, Key used: {}'.format(ratio, title_info, title_provider, key_used))
            results.append(MatchObject(i, j, ratio))
    # Returns the result with highest ratio.
    return max(results, key=lambda item: item.ratio)