From f3aae408cab0e547ba21f5365fe367de2876cce6 Mon Sep 17 00:00:00 2001 From: nate-moo <40650681+nate-moo@users.noreply.github.com> Date: Sun, 16 May 2021 01:00:12 -0400 Subject: [PATCH 1/8] Kwik Fixes --- anime_downloader/extractors/kwik.py | 130 ++++++++++++++++++++-------- anime_downloader/sites/animepahe.py | 2 +- anime_downloader/sites/init.py | 1 + 3 files changed, 98 insertions(+), 35 deletions(-) diff --git a/anime_downloader/extractors/kwik.py b/anime_downloader/extractors/kwik.py index 37bdaf0..0e157e1 100644 --- a/anime_downloader/extractors/kwik.py +++ b/anime_downloader/extractors/kwik.py @@ -1,10 +1,15 @@ import logging +from platform import node import re +import subprocess import requests +import tempfile from anime_downloader.extractors.base_extractor import BaseExtractor +from anime_downloader.sites.helpers.request import temp_dir from anime_downloader.sites import helpers from anime_downloader import util +from anime_downloader.util import eval_in_node from subprocess import CalledProcessError logger = logging.getLogger(__name__) @@ -18,55 +23,112 @@ class Kwik(BaseExtractor): ''' def _get_data(self): + ld = logger.debug # Kwik servers don't have direct link access you need to be referred # from somewhere, I will just use the url itself. We then # have to rebuild the url. Hopefully kwik doesn't block this too # Necessary - self.url = self.url.replace(".cx/e/", ".cx/f/") - self.headers.update({"referer": self.url}) + #ld(self.url) + #self.url = self.url.replace(".cx/e/", ".cx/f/") + #self.headers.update({"referer": self.url}) - cookies = util.get_hcaptcha_cookies(self.url) + headers = {"Referer": "https://kwik.cx/"} - if not cookies: - resp = util.bypass_hcaptcha(self.url) - else: - resp = requests.get(self.url, cookies=cookies) + + + res = requests.get(self.url, headers=headers) - title_re = re.compile(r'title>(.*)<') + #ld(res.text) - kwik_text = resp.text - deobfuscated = None + evalText = helpers.soupify(res.text) - loops = 0 - while not deobfuscated and loops < 6: - try: - deobfuscated = helpers.soupify(util.deobfuscate_packed_js(re.search(r'<(script).*(var\s+_.*escape.*?)\1>(?s)', kwik_text).group(2))) - except (AttributeError, CalledProcessError) as e: - if type(e) == AttributeError: - resp = util.bypass_hcaptcha(self.url) - kwik_text = resp.text + scripts = evalText.select("script") - if type(e) == CalledProcessError: - resp = requests.get(self.url, cookies=cookies) - finally: - cookies = resp.cookies - title = title_re.search(kwik_text).group(1) - loops += 1 + for i in scripts: + rexd = re.compile("", "") + break - post_url = deobfuscated.form["action"] - token = deobfuscated.input["value"] + tf = tempfile.mktemp(dir=temp_dir) - resp = helpers.post(post_url, headers=self.headers, params={"_token": token}, cookies=cookies, allow_redirects=False) - stream_url = resp.headers["Location"] + with open(tf, 'w', encoding="utf-8") as f: + f.write(rexd) + + #print(tf) - logger.debug('Stream URL: %s' % stream_url) + #ld(nodeRes) + + nodeRes = str(subprocess.getoutput(f"node {tf}")) + + ld(nodeRes) + + stream_url = re.search(r"source='([^;]*)';", nodeRes).group().replace("source='", "").replace("';", "") + #reg = re.compile("[\s\S]*") + + ld(stream_url) + + #kwik_text = resp.text + + #title_re = re.compile(r'title>(.*)<') + #title = title_re.search(kwik_text).group(1) return { 'stream_url': stream_url, - 'meta': { - 'title': title, - 'thumbnail': '' - }, - 'referer': None +# 'meta': { +# 'title': title, +# 'thumbnail': '' +# }, + 'referer': "https://kwik.cx/" } + + + + + #cookies = util.get_hcaptcha_cookies(self.url) + + #if not cookies: + # resp = util.bypass_hcaptcha(self.url) + #else: + # resp = requests.get(self.url, cookies=cookies) + + + + # + #deobfuscated = None + + #loops = 0 + #while not deobfuscated and loops < 6: + # try: + # deobfuscated = helpers.soupify(util.deobfuscate_packed_js(re.search(r'<(script).*(var\s+_.*escape.*?)\1>(?s)', kwik_text).group(2))) + # except (AttributeError, CalledProcessError) as e: + # if type(e) == AttributeError: + # resp = util.bypass_hcaptcha(self.url) + # kwik_text = resp.text + + # if type(e) == CalledProcessError: + # resp = requests.get(self.url, cookies=cookies) + # finally: + # cookies = resp.cookies + # + # loops += 1 + + #post_url = deobfuscated.form["action"] + #token = deobfuscated.input["value"] + + #resp = helpers.post(post_url, headers=self.headers, params={"_token": token}, cookies=cookies, allow_redirects=False) + #stream_url = resp.headers["Location"] + + #logger.debug('Stream URL: %s' % stream_url) + + #return { + # 'stream_url': stream_url, + # 'meta': { + # 'title': title, + # 'thumbnail': '' + # }, + # 'referer': None + #} diff --git a/anime_downloader/sites/animepahe.py b/anime_downloader/sites/animepahe.py index 97ddb6b..9f09cb0 100644 --- a/anime_downloader/sites/animepahe.py +++ b/anime_downloader/sites/animepahe.py @@ -74,7 +74,7 @@ class AnimePahe(Anime, sitename='animepahe'): for search_result in search_results['data']: search_result_info = SearchResult( title=search_result['title'], - url=cls.base_anime_url + search_result['slug'], + url=cls.base_anime_url + search_result['session'], poster=search_result['poster'] ) diff --git a/anime_downloader/sites/init.py b/anime_downloader/sites/init.py index 054d83b..0e8d2c8 100644 --- a/anime_downloader/sites/init.py +++ b/anime_downloader/sites/init.py @@ -18,6 +18,7 @@ ALL_ANIME_SITES = [ ('animetake','animetake','AnimeTake'), ('animeonline','animeonline360','AnimeOnline'), ('animeout', 'animeout', 'AnimeOut'), + ('animepahe', 'animepahe', 'AnimePahe'), ('animerush', 'animerush', 'AnimeRush'), ('animesimple', 'animesimple', 'AnimeSimple'), ('animesuge', 'animesuge', 'AnimeSuge'), From fd7599e8629beff0c304e04aad58b39e865b08b7 Mon Sep 17 00:00:00 2001 From: nate-moo <40650681+nate-moo@users.noreply.github.com> Date: Sun, 16 May 2021 01:01:30 -0400 Subject: [PATCH 2/8] autopep8 --- anime_downloader/extractors/kwik.py | 44 +++++++++++++---------------- anime_downloader/sites/animepahe.py | 6 ++-- 2 files changed, 23 insertions(+), 27 deletions(-) diff --git a/anime_downloader/extractors/kwik.py b/anime_downloader/extractors/kwik.py index 0e157e1..54c2180 100644 --- a/anime_downloader/extractors/kwik.py +++ b/anime_downloader/extractors/kwik.py @@ -29,17 +29,15 @@ class Kwik(BaseExtractor): # have to rebuild the url. Hopefully kwik doesn't block this too # Necessary - #ld(self.url) + # ld(self.url) #self.url = self.url.replace(".cx/e/", ".cx/f/") #self.headers.update({"referer": self.url}) headers = {"Referer": "https://kwik.cx/"} - - res = requests.get(self.url, headers=headers) - #ld(res.text) + # ld(res.text) evalText = helpers.soupify(res.text) @@ -57,18 +55,19 @@ class Kwik(BaseExtractor): with open(tf, 'w', encoding="utf-8") as f: f.write(rexd) - - #print(tf) - #ld(nodeRes) - + # print(tf) + + # ld(nodeRes) + nodeRes = str(subprocess.getoutput(f"node {tf}")) ld(nodeRes) - stream_url = re.search(r"source='([^;]*)';", nodeRes).group().replace("source='", "").replace("';", "") + stream_url = re.search( + r"source='([^;]*)';", nodeRes).group().replace("source='", "").replace("';", "") #reg = re.compile("[\s\S]*") - + ld(stream_url) #kwik_text = resp.text @@ -78,30 +77,25 @@ class Kwik(BaseExtractor): return { 'stream_url': stream_url, -# 'meta': { -# 'title': title, -# 'thumbnail': '' -# }, + # 'meta': { + # 'title': title, + # 'thumbnail': '' + # }, 'referer': "https://kwik.cx/" } - - - #cookies = util.get_hcaptcha_cookies(self.url) - #if not cookies: + # if not cookies: # resp = util.bypass_hcaptcha(self.url) - #else: + # else: # resp = requests.get(self.url, cookies=cookies) - - # #deobfuscated = None #loops = 0 - #while not deobfuscated and loops < 6: + # while not deobfuscated and loops < 6: # try: # deobfuscated = helpers.soupify(util.deobfuscate_packed_js(re.search(r'<(script).*(var\s+_.*escape.*?)\1>(?s)', kwik_text).group(2))) # except (AttributeError, CalledProcessError) as e: @@ -113,7 +107,7 @@ class Kwik(BaseExtractor): # resp = requests.get(self.url, cookies=cookies) # finally: # cookies = resp.cookies - # + # # loops += 1 #post_url = deobfuscated.form["action"] @@ -124,11 +118,11 @@ class Kwik(BaseExtractor): #logger.debug('Stream URL: %s' % stream_url) - #return { + # return { # 'stream_url': stream_url, # 'meta': { # 'title': title, # 'thumbnail': '' # }, # 'referer': None - #} + # } diff --git a/anime_downloader/sites/animepahe.py b/anime_downloader/sites/animepahe.py index 9f09cb0..8db5992 100644 --- a/anime_downloader/sites/animepahe.py +++ b/anime_downloader/sites/animepahe.py @@ -21,7 +21,8 @@ class AnimePaheEpisode(AnimeEpisode, sitename='animepahe'): 'session': session_id } - episode_data = helpers.get('https://animepahe.com/api', params=params).json() + episode_data = helpers.get( + 'https://animepahe.com/api', params=params).json() episode_data = episode_data['data'] sources = {} @@ -39,7 +40,8 @@ class AnimePaheEpisode(AnimeEpisode, sitename='animepahe'): sources = [] server_list = re.findall(r'data-provider="([^"]+)', source_text) - episode_id, session_id = re.search("getUrls\((\d+?), \"(.*)?\"", source_text).groups() + episode_id, session_id = re.search( + "getUrls\((\d+?), \"(.*)?\"", source_text).groups() for server in server_list: if server not in supported_servers: From 52d768e6c2365091e66d4d008313aac5301b44f8 Mon Sep 17 00:00:00 2001 From: nate-moo <40650681+nate-moo@users.noreply.github.com> Date: Sun, 16 May 2021 01:04:01 -0400 Subject: [PATCH 3/8] removing commented out code --- anime_downloader/extractors/kwik.py | 63 ----------------------------- 1 file changed, 63 deletions(-) diff --git a/anime_downloader/extractors/kwik.py b/anime_downloader/extractors/kwik.py index 54c2180..3cb93f9 100644 --- a/anime_downloader/extractors/kwik.py +++ b/anime_downloader/extractors/kwik.py @@ -29,16 +29,11 @@ class Kwik(BaseExtractor): # have to rebuild the url. Hopefully kwik doesn't block this too # Necessary - # ld(self.url) - #self.url = self.url.replace(".cx/e/", ".cx/f/") - #self.headers.update({"referer": self.url}) headers = {"Referer": "https://kwik.cx/"} res = requests.get(self.url, headers=headers) - # ld(res.text) - evalText = helpers.soupify(res.text) scripts = evalText.select("script") @@ -55,74 +50,16 @@ class Kwik(BaseExtractor): with open(tf, 'w', encoding="utf-8") as f: f.write(rexd) - - # print(tf) - - # ld(nodeRes) - nodeRes = str(subprocess.getoutput(f"node {tf}")) ld(nodeRes) stream_url = re.search( r"source='([^;]*)';", nodeRes).group().replace("source='", "").replace("';", "") - #reg = re.compile("[\s\S]*") ld(stream_url) - #kwik_text = resp.text - - #title_re = re.compile(r'title>(.*)<') - #title = title_re.search(kwik_text).group(1) - return { 'stream_url': stream_url, - # 'meta': { - # 'title': title, - # 'thumbnail': '' - # }, 'referer': "https://kwik.cx/" } - - #cookies = util.get_hcaptcha_cookies(self.url) - - # if not cookies: - # resp = util.bypass_hcaptcha(self.url) - # else: - # resp = requests.get(self.url, cookies=cookies) - - # - #deobfuscated = None - - #loops = 0 - # while not deobfuscated and loops < 6: - # try: - # deobfuscated = helpers.soupify(util.deobfuscate_packed_js(re.search(r'<(script).*(var\s+_.*escape.*?)\1>(?s)', kwik_text).group(2))) - # except (AttributeError, CalledProcessError) as e: - # if type(e) == AttributeError: - # resp = util.bypass_hcaptcha(self.url) - # kwik_text = resp.text - - # if type(e) == CalledProcessError: - # resp = requests.get(self.url, cookies=cookies) - # finally: - # cookies = resp.cookies - # - # loops += 1 - - #post_url = deobfuscated.form["action"] - #token = deobfuscated.input["value"] - - #resp = helpers.post(post_url, headers=self.headers, params={"_token": token}, cookies=cookies, allow_redirects=False) - #stream_url = resp.headers["Location"] - - #logger.debug('Stream URL: %s' % stream_url) - - # return { - # 'stream_url': stream_url, - # 'meta': { - # 'title': title, - # 'thumbnail': '' - # }, - # 'referer': None - # } From 4710e0fddf68cb8450d55bd0b5dfdf7952759233 Mon Sep 17 00:00:00 2001 From: Arjix <53124886+ArjixGamer@users.noreply.github.com> Date: Sun, 16 May 2021 20:14:34 +0300 Subject: [PATCH 4/8] optimized animepahe I completely changed the way the episodes are scraped. But as a downside only the kwik server is used. --- anime_downloader/sites/animepahe.py | 179 ++++++++++++---------------- 1 file changed, 74 insertions(+), 105 deletions(-) diff --git a/anime_downloader/sites/animepahe.py b/anime_downloader/sites/animepahe.py index 8db5992..ea73981 100644 --- a/anime_downloader/sites/animepahe.py +++ b/anime_downloader/sites/animepahe.py @@ -8,59 +8,9 @@ from anime_downloader.sites import helpers logger = logging.getLogger(__name__) -class AnimePaheEpisode(AnimeEpisode, sitename='animepahe'): - QUALITIES = ['360p', '480p', '720p', '1080p'] - - def _get_source(self, episode_id, server, session_id): - # We will extract the episodes data through the animepahe api - # which returns the available qualities and the episode sources. - params = { - 'id': episode_id, - 'm': 'embed', - 'p': server, - 'session': session_id - } - - episode_data = helpers.get( - 'https://animepahe.com/api', params=params).json() - episode_data = episode_data['data'] - sources = {} - - for info in range(len(episode_data)): - quality = list(episode_data[info].keys())[0] - sources[f'{quality}p'] = episode_data[info][quality]['kwik'] - - if self.quality in sources: - return (server, sources[self.quality]) - return - - def _get_sources(self): - supported_servers = ['kwik', 'mp4upload', 'rapidvideo'] - source_text = helpers.get(self.url, cf=True).text - sources = [] - - server_list = re.findall(r'data-provider="([^"]+)', source_text) - episode_id, session_id = re.search( - "getUrls\((\d+?), \"(.*)?\"", source_text).groups() - - for server in server_list: - if server not in supported_servers: - continue - source = self._get_source(episode_id, server, session_id) - if source: - sources.append(source) - - if sources: - return sources - raise NotFoundError - - class AnimePahe(Anime, sitename='animepahe'): sitename = 'animepahe' api_url = 'https://animepahe.com/api' - base_anime_url = 'https://animepahe.com/anime/' - QUALITIES = ['360p', '480p', '720p', '1080p'] - _episodeClass = AnimePaheEpisode @classmethod def search(cls, query): @@ -71,68 +21,87 @@ class AnimePahe(Anime, sitename='animepahe'): } search_results = helpers.get(cls.api_url, params=params).json() - results = [] + if search_results['total'] == []: + return [] - for search_result in search_results['data']: - search_result_info = SearchResult( - title=search_result['title'], - url=cls.base_anime_url + search_result['session'], - poster=search_result['poster'] + return [ + SearchResult( + title=result['title'] + " (" + result['type'] + ")", + url="https://animepahe.com/anime/" + result['session'] + "/" + str(result['id']), # noqa + poster=result['poster'] ) + for result in search_results['data'] + ] - logger.debug(search_result_info) - results.append(search_result_info) + def _scrape_episodes(self): + attr = self.url.split('/') + session = attr[-2] + id_ = attr[-1] + page = 1 + headers = {'referer': 'https://animepahe.com/'} - return results + apiUri = self.api_url + '?m=release&id=' + id_ + '&sort=episode_asc&page=' + jsonResponse = helpers.get(apiUri + str(page), headers=headers).json() + lastPage = jsonResponse['last_page'] + perPage = jsonResponse['per_page'] + total = jsonResponse['total'] + ep = 1 + episodes = [] - def get_data(self): - page = helpers.get(self.url, cf=True).text - anime_id = re.search(r'&id=(\d+)', page).group(1) - - self.params = { - 'm': 'release', - 'id': anime_id, - 'sort': 'episode_asc', - 'page': 1 - } - - json_resp = helpers.get(self.api_url, params=self.params).json() - self._scrape_metadata(page) - self._episode_urls = self._scrape_episodes(json_resp) - self._len = len(self._episode_urls) - return self._episode_urls - - def _collect_episodes(self, ani_json, episodes=[]): - # Avoid changing original list - episodes = episodes[:] - - # If episodes is not an empty list we ensure that we start off - # from the length of the episodes list to get correct episode - # numbers - for no, anime_ep in enumerate(ani_json, len(episodes)): - episodes.append((no + 1, f'{self.url}/{anime_ep["id"]}',)) - - return episodes - - def _scrape_episodes(self, ani_json): - episodes = self._collect_episodes(ani_json['data']) - - if not episodes: - raise NotFoundError(f'No episodes found for {self.url}') + if (lastPage == 1 and perPage > total): + for epi in jsonResponse['data']: + episodes.append( + f'{self.api_url}?m=links&id={epi["anime_id"]}&session={epi["session"]}&p=kwik!!TRUE!!') else: - # Check if other pages exist since animepahe only loads - # first page and make subsequent calls to the api for every - # page - start_page = ani_json['current_page'] + 1 - end_page = ani_json['last_page'] + 1 - - for i in range(start_page, end_page): - self.params['page'] = i - resp = helpers.get(self.api_url, params=self.params).json() - - episodes = self._collect_episodes(resp['data'], episodes) - + stop = False + for page in range(lastPage): + if stop: + break + for i in range(perPage): + if ep <= total: + episodes.append( + f'{self.api_url}?m=release&id={id_}&sort=episode_asc&page={page+1}&ep={ep}!!FALSE!!') + ep += 1 + else: + stop = True + break return episodes def _scrape_metadata(self, data): self.title = re.search(r'