telegram_media_downloader/media_downloader.py

363 lines
10 KiB
Python
Raw Normal View History

2019-11-06 20:07:25 +08:00
"""Downloads media from telegram."""
2019-07-24 23:42:25 +08:00
import os
import logging
2020-06-11 22:21:12 +08:00
from typing import List, Tuple, Optional
from datetime import datetime as dt
2019-07-24 23:42:25 +08:00
2020-05-25 20:28:10 +08:00
import asyncio
2019-11-06 20:07:25 +08:00
import pyrogram
2020-07-11 05:38:57 +08:00
import yaml
2020-07-22 01:36:03 +08:00
from utils.file_management import get_next_name, manage_duplicate_file
2020-12-31 21:27:41 +08:00
from utils.log import LogFilter
2020-12-30 19:15:27 +08:00
from utils.meta import print_meta
2019-07-24 23:42:25 +08:00
2020-12-31 21:27:41 +08:00
2019-07-24 23:42:25 +08:00
# Directory containing this module: the config file is read from here and
# downloaded media paths are built underneath it.
THIS_DIR = os.path.dirname(os.path.abspath(__file__))
# Ids of messages whose download failed during the current run.
FAILED_IDS: list = []

# Log at INFO level and attach LogFilter to two pyrogram loggers.
logging.basicConfig(level=logging.INFO)
logging.getLogger("pyrogram.session.session").addFilter(LogFilter())
logging.getLogger("pyrogram.client").addFilter(LogFilter())
logger = logging.getLogger("media_downloader")
2019-07-24 23:42:25 +08:00
def update_config(config: dict):
    """
    Update existing configuration file.

    The message ids collected in ``FAILED_IDS`` are merged (without
    duplicates) into ``config["ids_to_retry"]`` before the configuration
    is written. ``config`` is modified in place.

    Parameters
    ----------
    config: dict
        Configuration to be written into config file.
    """
    # A missing or empty ``ids_to_retry`` entry is treated as an empty list
    # instead of raising KeyError / TypeError.
    previous_ids = config.get("ids_to_retry") or []
    config["ids_to_retry"] = list(set(list(previous_ids) + FAILED_IDS))
    # Write next to this module -- the same location the configuration is
    # loaded from -- rather than into the current working directory.
    config_path = os.path.join(THIS_DIR, "config.yaml")
    with open(config_path, "w") as yaml_file:
        yaml.dump(config, yaml_file, default_flow_style=False)
    logger.info("Updated last read message_id to config file")
2019-07-24 23:42:25 +08:00
2020-07-22 02:05:26 +08:00
def _can_download(
    _type: str, file_formats: dict, file_format: Optional[str]
) -> bool:
    """
    Check if the given file format can be downloaded.

    Only `audio`, `document` and `video` are filtered by format; every
    other media type is always allowed.

    Parameters
    ----------
    _type: str
        Type of media object.
    file_formats: dict
        Dictionary containing the list of file_formats
        to be downloaded for `audio`, `document` & `video`
        media types.
    file_format: str
        Format of the current file to be downloaded.

    Returns
    -------
    bool
        True if the file format can be downloaded else False.
    """
    if _type not in ("audio", "document", "video"):
        return True
    allowed_formats = file_formats[_type]
    # An empty (or unset) list allows nothing; previously this raised
    # IndexError on ``allowed_formats[0]``.
    if not allowed_formats:
        return False
    # "all" as the first entry disables format filtering for this type.
    return allowed_formats[0] == "all" or file_format in allowed_formats
def _is_exist(file_path: str) -> bool:
    """
    Tell whether a path exists and is something other than a directory.

    Parameters
    ----------
    file_path: str
        Absolute path of the file to be checked.

    Returns
    -------
    bool
        True if the path exists and is not a directory, else False.
    """
    # Directories never count as an existing file.
    if os.path.isdir(file_path):
        return False
    return os.path.exists(file_path)
async def _get_media_meta(
    media_obj: pyrogram.types.messages_and_media, _type: str
) -> Tuple[str, str, Optional[str]]:
    """
    Extract file reference, file name and file format.

    Parameters
    ----------
    media_obj: pyrogram.types.messages_and_media
        Media object to be extracted.
    _type: str
        Type of media object.

    Returns
    -------
    tuple
        file_ref, file_name, file_format
    """
    file_ref: str = media_obj.file_ref

    # The mime type may be missing or None; parse it only when present so
    # that a media object without one no longer raises AttributeError.
    mime_type = getattr(media_obj, "mime_type", None)
    mime_subtype: Optional[str] = (
        mime_type.split("/")[-1] if mime_type else None
    )

    # Only these media types expose a format to the caller.
    if _type in ["audio", "document", "video", "voice"]:
        file_format: Optional[str] = mime_subtype
    else:
        file_format = None

    if _type == "voice":
        # Voice messages get a name generated from their UTC timestamp.
        voice_name = "voice_{}".format(
            dt.utcfromtimestamp(media_obj.date).isoformat()
        )
        if file_format:
            voice_name = "{}.{}".format(voice_name, file_format)
        file_name: str = os.path.join(THIS_DIR, _type, voice_name)
    else:
        # With no file name the path is just the media type directory.
        file_name = os.path.join(
            THIS_DIR, _type, getattr(media_obj, "file_name", None) or ""
        )
    return file_ref, file_name, file_format
2020-06-11 22:05:41 +08:00
async def download_media(
    client: pyrogram.client.Client,
    message: pyrogram.types.Message,
    media_types: List[str],
    file_formats: dict,
) -> int:
    """
    Download media from Telegram.

    A message is attempted up to 3 times. An expired file reference
    triggers a refetch of the message; a timeout waits 5 seconds before
    the next attempt. Messages that still fail are recorded in
    ``FAILED_IDS``.

    Parameters
    ----------
    client: pyrogram.client.Client
        Client to interact with Telegram APIs.
    message: pyrogram.types.Message
        Message object retrieved from telegram.
    media_types: list
        List of strings of media types to be downloaded.
        Ex : `["audio", "photo"]`
        Supported formats:
            * audio
            * document
            * photo
            * video
            * voice
    file_formats: dict
        Dictionary containing the list of file_formats
        to be downloaded for `audio`, `document` & `video`
        media types.

    Returns
    -------
    int
        Current message id.
    """
    for retry in range(3):
        try:
            if message.media is None:
                return message.message_id
            for _type in media_types:
                _media = getattr(message, _type, None)
                if _media is None:
                    continue
                file_ref, file_name, file_format = await _get_media_meta(
                    _media, _type
                )
                if not _can_download(_type, file_formats, file_format):
                    continue
                # When the target already exists, download under the next
                # free name and let the duplicate handling decide afterwards.
                already_present = _is_exist(file_name)
                if already_present:
                    file_name = get_next_name(file_name)
                download_path = await client.download_media(
                    message, file_ref=file_ref, file_name=file_name
                )
                # Only hand a real path to the duplicate handling: the
                # download may yield no path (hence the check below).
                if download_path and already_present:
                    download_path = manage_duplicate_file(download_path)
                if download_path:
                    logger.info("Media downloaded - %s", download_path)
            # All requested media types were handled; stop retrying.
            break
        except pyrogram.errors.exceptions.bad_request_400.BadRequest:
            logger.warning(
                "Message[%d]: file reference expired, refetching...",
                message.message_id,
            )
            message = await client.get_messages(
                chat_id=message.chat.id,
                message_ids=message.message_id,
            )
            if retry == 2:
                # pylint: disable = C0301
                logger.error(
                    "Message[%d]: file reference expired for 3 retries, download skipped.",
                    message.message_id,
                )
                FAILED_IDS.append(message.message_id)
        except TypeError:
            # A TypeError raised here is handled as a download timeout.
            # pylint: disable = C0301
            logger.warning(
                "Timeout Error occured when downloading Message[%d], retrying after 5 seconds",
                message.message_id,
            )
            await asyncio.sleep(5)
            if retry == 2:
                logger.error(
                    "Message[%d]: Timing out after 3 reties, download skipped.",
                    message.message_id,
                )
                FAILED_IDS.append(message.message_id)
        except Exception as e:
            # Any other failure is not retried: record the id and give up.
            # pylint: disable = C0301
            logger.error(
                "Message[%d]: could not be downloaded due to following exception:\n[%s].",
                message.message_id,
                e,
                exc_info=True,
            )
            FAILED_IDS.append(message.message_id)
            break
    return message.message_id
async def process_messages(
    client: pyrogram.client.Client,
    messages: List[pyrogram.types.Message],
    media_types: List[str],
    file_formats: dict,
) -> int:
    """
    Download the media of a batch of messages concurrently.

    Parameters
    ----------
    client: pyrogram.client.Client
        Client to interact with Telegram APIs.
    messages: list
        List of telegram messages.
    media_types: list
        List of strings of media types to be downloaded.
        Ex : `["audio", "photo"]`
        Supported formats:
            * audio
            * document
            * photo
            * video
            * voice
    file_formats: dict
        Dictionary containing the list of file_formats
        to be downloaded for `audio`, `document` & `video`
        media types.

    Returns
    -------
    int
        Max value of list of message ids.
    """
    # One download coroutine per message, awaited together.
    pending_downloads = [
        download_media(client, current, media_types, file_formats)
        for current in messages
    ]
    processed_ids = await asyncio.gather(*pending_downloads)
    return max(processed_ids)
2019-07-24 23:42:25 +08:00
2020-12-13 00:05:44 +08:00
async def begin_import(config: dict, pagination_limit: int) -> dict:
    """
    Create pyrogram client and initiate download.

    The pyrogram client is created using the ``api_id``, ``api_hash``
    from the config and iterates through the chat history starting at
    ``last_read_message_id``, downloading the requested file_formats
    in batches.

    Parameters
    ----------
    config: dict
        Dict containing the config to create pyrogram client.
    pagination_limit: int
        Number of message to download asynchronously as a batch.

    Returns
    -------
    dict
        Updated configuration to be written into config file.
    """
    client = pyrogram.Client(
        "media_downloader",
        api_id=config["api_id"],
        api_hash=config["api_hash"],
    )
    pyrogram.session.Session.notice_displayed = True
    await client.start()
    try:
        last_read_message_id: int = config["last_read_message_id"]
        messages_iter = client.iter_history(
            config["chat_id"],
            offset_id=last_read_message_id,
            reverse=True,
        )
        messages_list: list = []
        async for message in messages_iter:
            messages_list.append(message)
            # Flush as soon as the batch is full. Sizing by the list itself
            # keeps every batch at ``pagination_limit`` messages (the old
            # separate counter let later batches grow by one) and never
            # submits an empty batch.
            if len(messages_list) >= pagination_limit:
                last_read_message_id = await process_messages(
                    client,
                    messages_list,
                    config["media_types"],
                    config["file_formats"],
                )
                messages_list = []
                # Persist progress after every completed batch.
                config["last_read_message_id"] = last_read_message_id
                update_config(config)
        # Remaining messages that did not fill a whole batch.
        if messages_list:
            last_read_message_id = await process_messages(
                client,
                messages_list,
                config["media_types"],
                config["file_formats"],
            )
    finally:
        # Stop the started client even when processing raised.
        await client.stop()
    config["last_read_message_id"] = last_read_message_id
    return config
2019-07-24 23:42:25 +08:00
def main():
    """
    Main function of the downloader.

    Loads ``config.yaml`` from the directory of this module, runs the
    import on the asyncio event loop, reports failed downloads and
    writes the updated configuration back.
    """
    # The context manager closes the file even if parsing the YAML fails.
    with open(os.path.join(THIS_DIR, "config.yaml")) as config_file:
        config = yaml.safe_load(config_file)
    updated_config = asyncio.get_event_loop().run_until_complete(
        begin_import(config, pagination_limit=100)
    )
    if FAILED_IDS:
        logger.info(
            "Downloading of %d files failed. "
            "Failed message ids are added to config file.\n"
            "Functionality to re-download failed downloads will be added "
            "in the next version of `Telegram-media-downloader`",
            len(set(FAILED_IDS)),
        )
    update_config(updated_config)
# Script entry point: runs only when the module is executed directly.
if __name__ == "__main__":
    # Hand the module logger to print_meta before starting the download.
    print_meta(logger)
    main()