2020-07-21 12:28:34 +08:00
|
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
|
from __future__ import absolute_import
|
|
|
|
|
import io
|
|
|
|
|
import os
|
|
|
|
|
import logging
|
|
|
|
|
from urllib.parse import unquote
|
|
|
|
|
|
|
|
|
|
from zipfile import ZipFile, is_zipfile
|
|
|
|
|
from rarfile import RarFile, is_rarfile
|
|
|
|
|
|
|
|
|
|
from guessit import guessit
|
2021-01-17 21:58:01 +08:00
|
|
|
|
from subliminal_patch.http import RetryingCFSession
|
2020-07-21 12:28:34 +08:00
|
|
|
|
import chardet
|
|
|
|
|
from bs4 import NavigableString, UnicodeDammit
|
|
|
|
|
from subzero.language import Language
|
|
|
|
|
|
|
|
|
|
from subliminal_patch.providers import Provider
|
|
|
|
|
from subliminal_patch.providers.mixins import ProviderSubtitleArchiveMixin
|
2021-06-11 10:55:58 +08:00
|
|
|
|
from subliminal_patch.subtitle import Subtitle, guess_matches
|
2020-07-21 12:28:34 +08:00
|
|
|
|
from subliminal_patch.score import get_scores, framerate_equal
|
|
|
|
|
from subliminal.providers import ParserBeautifulSoup
|
2021-06-11 10:55:58 +08:00
|
|
|
|
from subliminal.subtitle import sanitize, SUBTITLE_EXTENSIONS
|
2020-07-21 12:28:34 +08:00
|
|
|
|
from subliminal.video import Episode, Movie
|
|
|
|
|
from .utils import FIRST_THOUSAND_OR_SO_USER_AGENTS as AGENT_LIST
|
|
|
|
|
|
|
|
|
|
# Module-level logger, named after this module, used by the subtitle and provider classes below.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class SoustitreseuSubtitle(Subtitle):
    """Sous-Titres.eu Subtitle.

    Represents one subtitle file that was read out of an archive downloaded
    from sous-titres.eu. The raw file content is supplied at construction
    time.

    :param language: language of the subtitle.
    :param video: the video the subtitle was looked up for.
    :param str name: file name of the subtitle; it is also used as the
        subtitle id and as the release info.
    :param data: guess data for the release name, handed to ``guess_matches``
        by :meth:`get_matches`.
    :param content: raw subtitle content (passed to ``chardet.detect``, so
        presumably ``bytes`` -- confirm against the caller).
    :param bool is_perfect_match: whether the provider found the searched
        title among the site's search results.
    """
    provider_name = 'soustitreseu'

    def __init__(self, language, video, name, data, content, is_perfect_match):
        self.language = language
        self.video = video
        # The archive member name doubles as identifier and release info.
        self.srt_filename = name
        self.release_info = name
        # Both links start out unset here and can be assigned afterwards.
        self.page_link = None
        self.download_link = None
        self.data = data
        self.content = content
        self.matches = None
        self.hearing_impaired = None
        self.is_perfect_match = is_perfect_match
        # Cache for guess_encoding().
        self._guessed_encoding = None

    @property
    def id(self):
        """Identifier of the subtitle: its file name."""
        return self.srt_filename

    def get_matches(self, video):
        """Compute the set of properties of *video* this subtitle matches.

        Stores the result in ``self.matches`` and clears ``self.data``.

        :param video: the video to match against.
        :return: the matched property names.
        :rtype: set
        """
        matches = set()

        if self.is_perfect_match:
            # The title was found in the search results, so credit the
            # name-level match for the video type.
            matches.add('series' if isinstance(video, Episode) else 'title')

        # guess additional info from data
        matches |= guess_matches(video, self.data)

        self.matches = matches
        # Per the original author's note: removing this line makes the
        # subtitle object unpicklable.
        self.data = None
        return matches

    def guess_encoding(self):
        """Guess encoding using chardet, falling back on bs4.

        Overrides the default subtitle ``guess_encoding`` so that no
        language-specific encoding guessing is done; per the original
        author, chardet detection seems to yield better results here.
        The result is cached on the instance.

        :return: the guessed encoding.
        :rtype: str
        :raises ValueError: if neither chardet nor bs4 can detect an encoding.

        """
        if self._guessed_encoding:
            return self._guessed_encoding

        logger.info('Guessing encoding for language %s', self.language)

        # guess/detect encoding using chardet
        encoding = chardet.detect(self.content)['encoding']
        logger.info('Chardet found encoding %s', encoding)

        if not encoding:
            # fallback on bs4
            logger.info('Falling back to bs4 detection')
            a = UnicodeDammit(self.content)

            logger.info("bs4 detected encoding: %s", a.original_encoding)

            encoding = a.original_encoding
            if not encoding:
                # Format the message explicitly: exceptions do not apply
                # logging-style lazy %-arguments.
                raise ValueError(u"Couldn't guess the proper encoding for %s" % (self,))

        self._guessed_encoding = encoding
        return encoding
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class SoustitreseuProvider(Provider, ProviderSubtitleArchiveMixin):
    """Sous-Titres.eu Provider.

    Searches sous-titres.eu for a series or movie title, downloads the
    archives (zip or rar) linked from the matching pages and builds one
    :class:`SoustitreseuSubtitle` per subtitle file found inside them.
    Subtitle content is read while listing, so :meth:`download_subtitle`
    has nothing left to fetch.
    """
    subtitle_class = SoustitreseuSubtitle
    languages = {Language(code) for code in ['fra', 'eng']}
    video_types = (Episode, Movie)
    server_url = 'https://www.sous-titres.eu/'
    search_url = server_url + 'search.html'

    # Trailing dot-separated tokens of a (lower-cased, extension-less)
    # subtitle file name that are dropped before the name is given to guessit.
    _release_suffixes = ('tag', 'en', 'fr')

    def __init__(self):
        self.session = None
        # Set by each search: True when at least one search result
        # contained the requested title.
        self.is_perfect_match = False

    def initialize(self):
        """Open the HTTP session used for every request to the site."""
        self.session = RetryingCFSession()
        self.session.headers['Referer'] = self.server_url

    def terminate(self):
        """Close the HTTP session."""
        self.session.close()

    def _get_soup(self, url, params=None):
        """GET *url* and return the response parsed with ``html.parser``.

        :raises: whatever ``raise_for_status`` raises on an HTTP error.
        """
        r = self.session.get(url, params=params, timeout=30)
        r.raise_for_status()
        return ParserBeautifulSoup(r.content.decode('utf-8', 'ignore'), ['html.parser'])

    def _search_title_pages(self, title, selector):
        """Run the site search for *title* and return the hrefs of the
        results selected by *selector* whose text contains the title
        (case-insensitively). Updates ``self.is_perfect_match``.
        """
        soup = self._get_soup(self.search_url, params={'q': title})
        wanted = title.lower()
        pages = [link.attrs['href'] for link in soup.select(selector)
                 if wanted in link.text.lower()]
        self.is_perfect_match = bool(pages)
        return pages

    def _subtitles_from_download(self, download_link, page_link, video):
        """Download one archive and return the subtitles extracted from it,
        each tagged with *page_link* and *download_link*.
        """
        res = self.session.get(download_link, timeout=30)
        res.raise_for_status()

        archive = self._get_archive(res.content)
        if not archive:
            return []

        # extract the subtitle
        extracted = self._get_subtitle_from_archive(archive, video)
        for subtitle in extracted:
            subtitle.page_link = page_link
            subtitle.download_link = download_link
        return extracted

    @staticmethod
    def _parse_episodenum(text):
        """Parse the text of an ``.episodenum`` element.

        A ``<season>×<episode>`` label yields ``(season, episode)``. Any
        other label is read as one leading character followed by the season
        number and yields ``(season, None)``. Returns ``None`` when neither
        form can be parsed.
        """
        try:
            season, episode = text.split('×')
            return int(season), int(episode)
        except ValueError:
            pass
        try:
            return int(text[1:]), None
        except ValueError:
            return None

    @classmethod
    def _strip_release_suffixes(cls, release):
        """Drop trailing ``.tag`` / ``.en`` / ``.fr`` tokens from *release*.

        Works on whole dot-separated tokens; ``str.rstrip`` would instead
        strip any trailing run of those *characters* and damage release
        names such as ``...x264-fgt``.
        """
        tokens = release.split('.')
        while len(tokens) > 1 and tokens[-1] in cls._release_suffixes:
            tokens.pop()
        return '.'.join(tokens)

    def query_series(self, video, title):
        """Find subtitles for an episode.

        :param video: the episode searched for (its ``season`` and
            ``episode`` are compared against each archive's numbering).
        :param str title: series title to search for.
        :return: subtitles extracted from every matching archive.
        :rtype: list
        """
        subtitles = []

        # loop over series name
        for series_page in self._search_title_pages(title, '.serie > h3 > a'):
            page_link = self.server_url + series_page
            soup = self._get_soup(page_link)

            for item in soup.select('a.subList'):
                href = item.attrs['href']
                # Last path component without its 4-character extension.
                subtitles_archive_name = unquote(href.split('/')[-1][:-4])
                guessed_subs = guessit(subtitles_archive_name, {'type': 'episode'})

                label = item.select_one('.episodenum')
                numbering = self._parse_episodenum(label.text) if label is not None else None
                if numbering is None:
                    # One malformed entry must not abort the whole search.
                    logger.debug('Sous-Titres.eu: Skipping archive with unreadable numbering: %s',
                                 subtitles_archive_name)
                    continue

                # The numbering shown on the page overrides the guess made
                # from the archive name; without an episode on the page the
                # guessed episode (if any) is kept.
                season, episode = numbering
                guessed_subs.update({'season': season})
                if episode is not None:
                    guessed_subs.update({'episode': episode})

                if guessed_subs['season'] != video.season:
                    continue
                if 'episode' in guessed_subs and guessed_subs['episode'] != video.episode:
                    continue

                download_link = self.server_url + 'series/' + href
                subtitles.extend(self._subtitles_from_download(download_link, page_link, video))

        return subtitles

    def query_movies(self, video, title):
        """Find subtitles for a movie.

        :param video: the movie searched for.
        :param str title: movie title to search for.
        :return: subtitles extracted from every archive on the matching pages.
        :rtype: list
        """
        subtitles = []

        # loop over movies name
        for movies_page in self._search_title_pages(title, '.film > h3 > a'):
            page_link = self.server_url + movies_page
            soup = self._get_soup(page_link)

            for item in soup.select('a.subList'):
                download_link = self.server_url + 'films/' + item.attrs['href']
                subtitles.extend(self._subtitles_from_download(download_link, page_link, video))

        return subtitles

    def list_subtitles(self, video, languages):
        """Return the subtitles found for *video* whose language is in *languages*."""
        # query for subtitles
        if isinstance(video, Episode):
            found = self.query_series(video, video.series)
        else:
            found = self.query_movies(video, video.title)

        return [s for s in found if s.language in languages]

    def download_subtitle(self, subtitle):
        """Return *subtitle* unchanged; its content was read at listing time."""
        return subtitle

    def _get_archive(self, content):
        """Open downloaded *content* as a rar or zip archive.

        :return: a ``RarFile`` or ``ZipFile``, or ``None`` when the content
            is neither.
        """
        # open the archive
        archive_stream = io.BytesIO(content)

        if is_rarfile(archive_stream):
            logger.debug('Sous-Titres.eu: Identified rar archive')
            return RarFile(archive_stream)

        if is_zipfile(archive_stream):
            logger.debug('Sous-Titres.eu: Identified zip archive')
            return ZipFile(archive_stream)

        logger.error('Sous-Titres.eu: Unsupported compressed format')
        return None

    def _get_subtitle_from_archive(self, archive, video):
        """Build a subtitle for every usable subtitle file in *archive*.

        Hidden files and files without a subtitle extension are ignored.
        For an episode, files whose guessed season/episode differ from the
        video's (or cannot be guessed) are ignored as well.

        :param archive: an opened archive exposing ``namelist()`` and ``read()``.
        :param video: the video the subtitles are for.
        :return: the subtitles built from the archive.
        :rtype: list
        """
        # some files have a non subtitle with .txt extension
        subtitle_extensions = tuple(ext for ext in SUBTITLE_EXTENSIONS if ext != '.txt')
        is_episode = isinstance(video, Episode)

        subtitles = []
        for name in archive.namelist():
            # discard hidden files
            if os.path.basename(name).startswith('.'):
                continue

            # discard non-subtitle files
            lowered = name.lower()
            if not lowered.endswith(subtitle_extensions):
                continue

            # get subtitles language: '.en.' in the name means English,
            # everything else is treated as French
            if '.en.' in lowered:
                language = Language.fromopensubtitles('eng')
            else:
                language = Language.fromopensubtitles('fre')

            release = self._strip_release_suffixes(os.path.splitext(lowered)[0])
            _guess = guessit(release)

            if is_episode:
                # .get(): a file without guessable numbering is skipped
                # instead of raising KeyError.
                if video.episode != _guess.get('episode') or video.season != _guess.get('season'):
                    continue

            content = archive.read(name)
            subtitles.append(SoustitreseuSubtitle(language, video, name, _guess, content, self.is_perfect_match))

        return subtitles
|