anime-downloader/anime_downloader/animeinfo.py

220 lines
8.0 KiB
Python

from anime_downloader.sites import helpers
import logging
from anime_downloader.sites.anime import Anime, AnimeEpisode, SearchResult
from anime_downloader.sites import get_anime_class
from anime_downloader.config import Config
from anime_downloader.util import primitive_search
import warnings

# fuzzywuzzy emits a warning at import time (e.g. when python-Levenshtein
# is absent); suppress it locally without touching global warning filters.
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    from fuzzywuzzy import fuzz

logger = logging.getLogger(__name__)
class AnimeInfo:
    """
    Container for show metadata scraped from an info page (MAL/AniList).

    Attributes
    ----------
    url: string
        URL for the info page
    title: string
        English name of the show.
    jp_title: string
        Japanese name of the show.
    metadata: dict
        Data not critical for core functions
    episodes: int
        Max amount of episodes
    """

    def __init__(self, url, episodes, title=None, jp_title=None, metadata=None):
        self.url = url
        self.episodes = episodes
        self.title = title
        self.jp_title = jp_title
        # `metadata={}` as a default would be a single dict shared by every
        # instance constructed without one (mutable default pitfall), so a
        # fresh dict is created per instance instead.
        self.metadata = {} if metadata is None else metadata
class MatchObject:
    """
    Pairs a metadata result with a provider result plus their similarity.

    Attributes
    ----------
    AnimeInfo: object
        Metadata object from the MAL search.
    SearchResult: object
        Metadata object from the provider search
    ratio: int
        A number between 0-100 describing the similarities between
        SearchResult and AnimeInfo. Higher number = more similar.
    """

    def __init__(self, AnimeInfo, SearchResult, ratio=100):
        # Plain data holder: attribute names deliberately mirror the
        # parameter names, since both are read directly by callers.
        self.ratio = ratio
        self.AnimeInfo = AnimeInfo
        self.SearchResult = SearchResult
# Not used
def search_mal(query):
def search(query):
soup = helpers.soupify(helpers.get('https://myanimelist.net/anime.php', params={'q': query}))
search_results = soup.select("a.hoverinfo_trigger.fw-b.fl-l")
return [SearchResult(
url=i.get('href'),
title=i.select('strong')[0].text
) for i in search_results]
def scrape_metadata(url):
soup = helpers.soupify(helpers.get(url))
"""
info_dict contains something like this: [{
'url': 'https://myanimelist.net/anime/37779/Yakusoku_no_Neverland',
'title': 'The Promised Neverland',
'jp_title': '約束のネバーランド'
},{
'url': 'https://myanimelist.net/anime/39617/Yakusoku_no_Neverland_2nd_Season',
'title': 'The Promised Neverland 2nd Season',
'jp_title': '約束のネバーランド 第2期'}]
"""
info_dict = {
'url': url
}
# Maps specified info in sidebar to variables in info_dict
name_dict = {
'Japanese:': 'jp_title',
'English:': 'title',
'synonyms:': 'synonyms',
'Episodes:': 'episodes'
}
info = soup.select('span.dark_text')
extra_info = [i.parent.text.strip() for i in info]
for i in extra_info:
text = i.replace('\n', '').strip()
for j in name_dict:
if text.startswith(j):
info_dict[name_dict[j]] = text[len(j):].strip()
# Backup name if no English name isn't registered in sidebar
if not info_dict.get('title'):
name = soup.select('span[itemprop=name]')
info_dict['title'] = name[0].text if name else None
# Always sets episodes
if not info_dict.get('episodes') or info_dict.get('episodes') == 'Unknown':
info_dict['episodes'] = 0
# TODO error message when this stuff is not correctly scraped
# Can happen if MAL is down or something similar
return AnimeInfo(url=info_dict['url'], title=info_dict.get('title'),
jp_title=info_dict.get('jp_title'), episodes=int(info_dict['episodes']))
search_results = search(query)
season_info = []
# Max 10 results
for i in range(min(len(search_results), 10)):
anime_info = scrape_metadata(search_results[i].url)
if anime_info.episodes:
season_info.append(anime_info)
# Code below uses the first result to compare
#season_info = [scrape_metadata(search_results[0].url)]
# return season_info
# Prompts the user for selection
return primitive_search(season_info)
# Choice allows the user to preselect, used to download from a list overnight.
# None prompts the user.
def search_anilist(query, choice=None):
def search(query):
ani_query = """
query ($id: Int, $page: Int, $search: String, $type: MediaType) {
Page (page: $page, perPage: 10) {
media (id: $id, search: $search, type: $type) {
id
idMal
description(asHtml: false)
seasonYear
title {
english
romaji
native
}
coverImage {
extraLarge
}
bannerImage
averageScore
status
episodes
}
}
}
"""
url = 'https://graphql.anilist.co'
# TODO check in case there's no results
# It seems to error on no results (anime -ll DEBUG dl "nev")
results = helpers.post(url, json={'query': ani_query, 'variables': {'search': query, 'page': 1, 'type': 'ANIME'}}).json()['data']['Page']['media']
if not results:
logger.error('No results found in anilist')
raise NameError
search_results = [AnimeInfo(url='https://anilist.co/anime/' + str(i['id']), title=i['title']['romaji'],
jp_title=i['title']['native'], episodes=int(i['episodes']), metadata=i) for i in results if i['episodes'] != None]
return search_results
search_results = search(query)
# This can also be fuzzied, but too many options.
if choice != None:
# Fixes too low or high to get a real value.
fixed_choice = ((choice - 1) % len(search_results))
return search_results[fixed_choice]
else:
# Prompts the user for selection
return primitive_search(search_results)
def fuzzy_match_metadata(seasons_info, search_results):
    """
    Return the MatchObject pairing the most similar titles.

    Every (AnimeInfo, SearchResult) pair is scored with fuzzywuzzy on both
    the english and japanese titles; the pair with the highest ratio wins.
    """
    results = []
    for i in seasons_info:
        for j in search_results:
            # Allows for returning of cleaned title by the provider using
            # 'title_cleaned' in meta_info, to make fuzzy matching better.
            title_provider = j.title.strip() if not j.meta_info.get('title_cleaned') else j.meta_info.get('title_cleaned').strip()
            # Some entries have no english title at all; fall back to an
            # empty string instead of crashing on None concatenation below.
            title_info = i.title or ''
            # Essentially adds the chosen key to the query if the version is in use.
            # Dirty solution, but should work pretty well.
            config = Config['siteconfig'].get(get_anime_class(j.url).sitename, {})
            version = config.get('version')
            version_use = version == 'dubbed'
            # Adds something like (Sub) or (Dub) to the title.
            key_used = j.meta_info.get('version_key_dubbed', '') if version_use else j.meta_info.get('version_key_subbed', '')
            title_info += ' ' + key_used
            title_info = title_info.strip()
            # TODO add synonyms
            # 0 if the japanese name is missing on either side.
            jap_ratio = fuzz.ratio(i.jp_title, j.meta_info['jp_title']) if i.jp_title and j.meta_info.get('jp_title') else 0
            # Outputs the max ratio for japanese or english name (0-100).
            ratio = max(fuzz.ratio(title_info, title_provider), jap_ratio)
            logger.debug('Ratio: {}, Info title: {}, Provider Title: {}, Key used: {}'.format(ratio, title_info, title_provider, key_used))
            results.append(MatchObject(i, j, ratio))
    # Returns the result with highest ratio.
    return max(results, key=lambda item: item.ratio)