Program for scraping groups in Telegram
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 

459 lines
27 KiB

# -*- coding: utf-8 -*-
# https://try.gitea.io/egonalbrecht/mass_ban_telegram or https://git.teknik.io/egonalbrecht/mass_ban_telegram
# Author: Telegram -> @chavespatriota, Chat ID = 1670082376, https://t.me/s/canaisfascistasbr, 03-18-2021.
# Program made in Linux. Edit if necessary. Suggested program for editing: Kate.
# Do not use the name 'telethon' as the name of the program. Use python3. Install all packages that are requested. Use 'pip3 install' to install packages. To run the program on Linux, open the terminal in the folder where the program is and run the command 'python3 backup_channel_telegram.py'.
# Starting point: https://medium.com/game-of-data/telegram-channel-data-extraction-users-information-chats-and-specific-messages-and-data-21bb54710fd3
# Previous works: https://github.com/egonalbrecht/mass_ban_telegram
##############################
#This variable defines whether the backup will include author and date. Select "True" for yes, and "False" for no.
#Enable_Author_and_Date = False
##############################
#Whether each backed-up message is annotated with its author and date.
#The mode with date and author is having problems with synchronization and is still in development. Do not modify here, or try to use a higher wait time to avoid suspension of Telegram by flood (a little longer than 90 seconds should work).
Enable_Author_and_Date = False
from telethon import TelegramClient, sync
from telethon.sync import TelegramClient
#If you see the message "Security error while unpacking a received message: Server replied with a wrong session ID" or any other strange problem, delete the file "session_..." and log in again.
print('Check the "Backup Monitoring" folder to track progress...')
import datetime
now = datetime.datetime.now()
date_now = now.strftime("%m-%d-%Y at %Hhr %Mmin %Ssec") #On Windows ":" cannot be used to separate the time, as it can on Linux.
import io #Required to open files on Windows with an explicit encoding. https://stackoverflow.com/questions/27092833/unicodeencodeerror-charmap-codec-cant-encode-characters
import os
from subprocess import run #No longer called in this section; the import is kept in case other code relies on the name.

#Make sure the monitoring folder exists, otherwise opening the progress file below fails.
_monitoring_dir = 'Backup Monitoring'
os.makedirs(_monitoring_dir, exist_ok=True)
#Counts the monitoring files and cleans the folder to prevent accumulation. Storage limit is 100 files.
#Done with the standard library instead of a shell pipeline so it also works outside Linux
#and does not depend on shell quoting. Hidden entries are ignored and directories are kept,
#which is what "ls | wc -l" followed by "rm folder/*" did.
_monitoring_entries = [name for name in os.listdir(_monitoring_dir) if not name.startswith('.')]
if len(_monitoring_entries) > 100:
    for _entry_name in _monitoring_entries:
        _entry_path = os.path.join(_monitoring_dir, _entry_name)
        if not os.path.isdir(_entry_path):
            os.remove(_entry_path)

#Problematic message media that must not be forwarded: polls and games.
messages_not_iterable = ['MessageMediaPoll', 'MessageMediaGame']
#Progress log for this run; the other sections of the script print into it.
mon_file = io.open('Backup Monitoring/Progress Monitoring - ' + date_now + '.txt', 'w', encoding="utf-8")
print('Author: Telegram -> @chaveseducado, Chat ID = 1670082376, 03-18-2021.', file=mon_file)
#Reads api_id, api_hash and session_name from the settings file.
#Expected line shapes (spaces and single quotes are ignored): api_id=..., api_hash=..., session_name=...
#Lines starting with '#' are comments; blank lines are skipped instead of raising IndexError.
api_id = None
api_hash = None
session_name = None
with io.open("Files and Settings/api_id and api_hash.txt", 'r', encoding="utf-8") as file3_in:
    id_hash_temp = [x.strip() for x in file3_in] #Removes the '\n' from all entries in the list.
for i_id_hash_temp in id_hash_temp:
    if not i_id_hash_temp or i_id_hash_temp.startswith('#'):
        continue
    i_id_hash_temp = i_id_hash_temp.replace(" ", "").replace("'", "")
    if i_id_hash_temp.startswith('api_id='):
        api_id = int(i_id_hash_temp[7:])
    if i_id_hash_temp.startswith('api_hash='):
        api_hash = i_id_hash_temp[9:]
    if i_id_hash_temp.startswith('session_name='):
        session_name = i_id_hash_temp[13:]
#Fail here with a readable message rather than with a NameError further down the script.
_missing_settings = [
    _name
    for _name, _value in (('api_id', api_id), ('api_hash', api_hash), ('session_name', session_name))
    if _value is None
]
if _missing_settings:
    raise ValueError("Missing setting(s) in 'Files and Settings/api_id and api_hash.txt': " + ", ".join(_missing_settings))
#Logs the settings in use and opens the Telegram session.
#NOTE(review): this writes api_hash in clear text into the monitoring file - confirm that is acceptable.
print('Your personal data is:', file=mon_file)
print(' api_id =', api_id, file=mon_file)
print(' api_hash =', api_hash, file=mon_file)
print(' session_name = ', session_name, file=mon_file)
try:
    client = TelegramClient(session_name, api_id, api_hash).start()
except Exception as _start_error:
    #Only ordinary errors are handled here, so Ctrl+C during the login still interrupts the program.
    _start_failure_text = "You're running the program twice, or, two clients are trying to access the same session. If the problem persists, delete the 'session' file and log in again." #https://t.me/TelethonChat/307924
    print(_start_failure_text, file=mon_file)
    print(_start_failure_text)
    #Keep the real cause as well, since the hint above covers only one possible failure.
    print("Underlying error: " + repr(_start_error), file=mon_file)
    print("Underlying error: " + repr(_start_error))
    mon_file.flush()
    raise SystemExit(1)
#Reads the list of "source,backup" entity pairs, one pair per line.
#Lines starting with '#' are comments; blank lines are skipped instead of raising IndexError.
#Result: entitys is a list of [source, backup, ...] lists and num_entitys is its length.

#Link prefix -> number of leading characters to drop from the source entity.
#The first matching prefix wins. Invite links keep their 't.me/joinchat/' part; other links are reduced to the username.
_TME_PREFIX_CUTS = (
    ('t.me/joinchat/', 0),
    ('https://t.me/joinchat/', 8),
    ('http://t.me/joinchat/', 7),
    ('t.me/', 5),
    ('https://t.me/', 13),
    ('http://t.me/', 12),
)
with io.open("Files and Settings/Backup channels.txt", 'r', encoding="utf-8") as f_channel_in:
    entitys_temp = [x.strip() for x in f_channel_in] #Removes the '\n' from all entries in the list.
entitys = []
for i_entitys_temp in entitys_temp:
    if not i_entitys_temp or i_entitys_temp.startswith('#'):
        continue
    i_entitys_temp = i_entitys_temp.replace(" ", "")
    if i_entitys_temp.startswith('@'):
        i_entitys_temp = i_entitys_temp[1:]
    i_entitys_temp = i_entitys_temp.split(",")
    if len(i_entitys_temp) < 2:
        #The rest of the script indexes [1] for the backup entity, so report the bad line now.
        raise ValueError("Line without a backup entity in 'Files and Settings/Backup channels.txt': " + ",".join(i_entitys_temp))
    #Get username from friendly entity
    for _prefix, _cut in _TME_PREFIX_CUTS:
        if i_entitys_temp[0].startswith(_prefix):
            i_entitys_temp[0] = i_entitys_temp[0][_cut:]
            break
    entitys.append(i_entitys_temp)
num_entitys = len(entitys)
import os
import os.path
#Folder holding one text file per source entity with the IDs of the messages already saved.
path_IDs = 'Files and Settings/ID Saved Messages/'
#The folder is only created when there is at least one entity to back up.
if num_entitys > 0 and not os.path.exists(path_IDs):
    os.makedirs(path_IDs)
#Creates an empty ID file for every entity that does not have one yet; existing files are left untouched.
_path_unsafe_chars = str.maketrans('/.', '__')
for i in range(num_entitys):
    name_file_less_slash = entitys[i][0].translate(_path_unsafe_chars)
    _id_file_path = path_IDs + name_file_less_slash + ".txt"
    if os.path.isfile(_id_file_path):
        continue
    with io.open(_id_file_path, 'w+', encoding="utf-8"):
        pass
#Opens each entity's ID file, loads the saved "source_id backup_id" pairs and leaves the file
#positioned at its end so that later prints to f_ID[i] append new pairs.
IDs = [] #One dict per entity: source message ID (str) -> backup message ID (str).
ID_last = [] #One int per entity: source ID on the last valid line of its file, or 0 when there is none.
f_ID = [] #One open file object per entity.
print("Entities to copy and its respective backup entities:", file=mon_file)
for i in range(num_entitys):
    IDs_entitys = {}
    print(" {}. {} -> {}".format(i+1, entitys[i][0], entitys[i][1]), file=mon_file)
    name_file_less_slash = entitys[i][0].replace('/', '_').replace('.', '_')
    f_ID.append(io.open(path_IDs + name_file_less_slash + ".txt", 'r+', encoding="utf-8"))
    _last_pair = None
    for line in f_ID[i]: #https://stackoverflow.com/a/54278929/5175660
        par_ID = line.split()
        if len(par_ID) < 2:
            #Blank or truncated line (for example after an interrupted run): ignore it instead of crashing.
            continue
        IDs_entitys[par_ID[0]] = par_ID[1]
        _last_pair = par_ID
    IDs.append(IDs_entitys)
    ID_last.append(int(_last_pair[0]) if _last_pair is not None else 0)
    #Pads the detail line so it lines up under the numbered entry printed above.
    esp = ' ' * (len(str(i+1)) + 1) #https://stackoverflow.com/a/25293744/5175660
    print(esp + " Last saved message ID: {}".format(ID_last[i]), file=mon_file)
print("-------------------------------------------------------", file=mon_file)
from telethon import errors
from telethon.tl.functions.channels import GetFullChannelRequest
from time import sleep
from datetime import datetime
#Timing, in seconds: pause before each message, and how much that pause grows after every flood error.
#Suspension by flood generates errors in the program. Try to use the shortest time that does not generate suspension by flood. The ideal time seems to be a little more than 80 seconds.
wait_time, add_time = 3.0, 10.0
#Module-level state shared by the sending routines, declared here with neutral starting values.
#https://stackoverflow.com/questions/41335752/variable-not-available-to-sub-function
author_date = id_reply_backup = album = i_alb = None
album_caption = time_sent = ''
len_author_date = alb_last_id = 0
#Date layouts: the first is used for the author/date annotation, the second for the monitoring log.
format = "%m/%d/%Y"
format2 = "%m-%d-%Y at %Hhr %Mmin %Ssec"
#Problem with links using markdown formatting, and with HTML formatting only in print. HTML formatting selected for the entire program.
client.parse_mode = 'html'
#Monitoring-log text per message kind: (text when a separate author/date reply was sent, text otherwise).
_SINGLE_LOG_TEXT = {
    'text': (" Text type message with reply sent", " Text type message sent"),
    'sticker': (" Sticker type message with reply sent", " Sticker type message sent"),
    'sticker_like': (" Sticker-like message with reply sent", " Sticker-like message sent"),
    'media': (" Non-sticker media message with reply sent", " Non-sticker media message sent"),
}


def _single_message_kind(msg):
    """Return (kind, limit) for msg: a key of _SINGLE_LOG_TEXT and the maximum text length, or None when no text can be embedded."""
    if msg.media is None:
        return 'text', 4096 #Maximum message size is 4096 characters.
    attributes = getattr(getattr(msg.media, 'document', None), 'attributes', None) or ()
    #Look at every document attribute: the sticker attribute is not guaranteed to sit at index 1.
    sticker = next((a for a in attributes if type(a).__name__ == 'DocumentAttributeSticker'), None)
    if sticker is None:
        return 'media', 1024
    if sticker.alt != '':
        #It is a sticker, because a sticker necessarily has an emoji. A sticker has no embedded caption.
        return 'sticker', None
    return 'sticker_like', 1024


def send_single(id_reply_backup): #https://stackoverflow.com/questions/16576553/python-only-pass-arguments-if-the-variable-exists
    """Send the current global `message` to the backup entity entitys[i][1] and log the result.

    id_reply_backup: ID (int, numeric string or None) of the backup message to reply to.

    When the author/date suffix fits within the size limit of the message kind it is appended
    to the message text. Otherwise the message is sent unchanged and, if Enable_Author_and_Date
    is set, the suffix is sent as a separate reply to last_id_backup_temp[0], which is then
    incremented to account for the extra message.
    """
    if id_reply_backup is not None:
        id_reply_backup = int(id_reply_backup)
    backup_entity = entitys[i][1]
    kind, limit = _single_message_kind(message)
    with_reply_text, plain_text = _SINGLE_LOG_TEXT[kind]
    #message.message is treated as empty when it is None so the length check cannot raise.
    fits_inline = limit is not None and len_author_date + len(message.message or '') <= limit
    if fits_inline:
        message.text += author_date
    client.send_message(backup_entity, message, reply_to=id_reply_backup)
    if not fits_inline and Enable_Author_and_Date:
        client.send_message(backup_entity, author_date, reply_to=int(last_id_backup_temp[0]))
        last_id_backup_temp[0] += 1
        print(with_reply_text + time_sent, file=mon_file)
    else:
        print(plain_text + time_sent, file=mon_file)
def send_album(id_reply_backup):
    """Forward the collected global `album` from entitys[i][0] to entitys[i][1] and log it.

    id_reply_backup: ID (int, numeric string or None) of a backup message. It is converted
    to int when given, but the forwarding call itself does not use it.

    A copy-based variant (send_file with caption and reply_to) used to live here and is
    disabled; albums are always forwarded.
    """
    if id_reply_backup is not None:
        id_reply_backup = int(id_reply_backup)
    backup_entity = entitys[i][1]
    client.forward_messages(backup_entity, album, entitys[i][0])
    annotate = (Enable_Author_and_Date == True)
    if not annotate:
        print(" Album forwarding" + time_sent, file=mon_file)
        return
    #The author/date text goes out as a reply to the backup ID computed for the last album item.
    reply_target = int(last_id_backup_temp[i_alb])
    client.send_message(backup_entity, author_date, reply_to=reply_target)
    print(" Album with reply forwarding" + time_sent, file=mon_file)
def subrotine():
#Processes the current module-level `message` of the entity with index `i`: forwards it (or the whole album it starts) to the backup entity entitys[i][1], then records the source-ID/backup-ID pairs in IDs[i] and in the file f_ID[i].
#Takes no parameters and returns nothing; every input and output is a module-level name (see the `global` statements below, plus message, i, client, entitys, IDs, f_ID and mon_file, which are only read).
#On errors.FloodWaitError it adds add_time to wait_time, sleeps e.seconds and calls itself again for the same message.
#NOTE(review): the indentation of this routine is missing in this copy of the file, so the nesting described in these comments is inferred from the statement order - confirm against a correctly indented copy.
global wait_time #https://giovannireisnunes.wordpress.com/2017/07/28/variaveis-globais-em-python/
global add_time
global num
global last_id_temp
global last_id_backup_temp
global format
global len_author_date
global author_date
global id_reply_backup
global album
global album_caption
global time_sent
global format2
global last_id
global last_id_backup
global alb_last_id
global i_alb
#Running counter of the calls to this routine; it is only used as a prefix in the monitoring log.
num += 1
print(str(num) + '. ', end='', flush=True, file=mon_file) #This 'flush' forces the output file to be updated.
print("Message ID: ", str(message.id), file=mon_file)
#Throttle: pause wait_time seconds before handling each message.
sleep(wait_time)
time_sent = message.date.strftime(format2)
time_sent = " " + time_sent
#If it gives flood error and enters the recursive routine the id last variables are resets.
#Both temp lists start from the last recorded source ID and the last recorded backup ID.
last_id_temp = []
last_id_backup_temp = []
last_id_temp.append(last_id[len(last_id)-1])
last_id_backup_temp.append(last_id_backup[len(last_id_backup)-1])
try:
#Check to see if it is not those messages of "date" and "entered the group", which are of the type " non-interatables". https://stackoverflow.com/questions/53467988/how-to-get-message-object-from-messageservice-object-in-telethon
if message.message is not None:
#Author and date.
author = message.post_author
date = message.date.strftime(format)
if Enable_Author_and_Date == True:
if (author is not None) and (message.fwd_from is None):
author_date = " <i>(" + author + ' ' + date + ")</i>"
else:
author_date = " <i>(" + date + ")</i>"
else:
author_date = ''
len_author_date = len(author_date)
#Check to see if it's an album.
album_old = message.grouped_id
if album_old is not None:
print(" Message ID " + str(message.id) + " belongs to the album: " + str(album_old), flush=True, file=mon_file)
album = []
album_caption = ''
album.append(message)
album_caption += message.text
i_alb = 0
#The first album item takes the next backup ID (last recorded backup ID + 1).
last_id_temp[0] = message.id
last_id_backup_temp[0] += 1
#Just to inform if it is an answer to some deleted or not message.
if message.reply_to is not None:
try:
id_reply = message.get_reply_message().id
except AttributeError as error: #https://airbrake.io/blog/php-exception-handling/attributeerror
print(" Message ID " + str(last_id_temp[0]) + " is a reply an deleted message.", file=mon_file)
except:
print(" ERROR!")
else:
pass
print(" Message ID " + str(last_id_temp[0]) + " is a response to message ID " + str(id_reply), file=mon_file)
#Makes another iter_messages() call to sweep over the album. Scans from the second media onwards.
#last_id_channel is the ID of the single message returned by get_messages(limit=1); it is compared below to detect an album that ends at the end of the channel.
last_id_channel = client.get_messages(entitys[i][0], limit=1)[0].id
for message_album in client.iter_messages(entitys[i][0], reverse=True, offset_id=message.id):
album_now = message_album.grouped_id
if (album_now == album_old): #Appends while the album ID does not change, or until you reach the end of the channel (carefull with boundarys).
print(" Message ID " + str(message_album.id) + " belongs to the album: " + str(album_now), file=mon_file)
album.append(message_album)
album_caption += message_album.text
#Each further album item gets the previous item's backup ID + 1.
last_id_temp.append(message_album.id)
last_id_backup_temp.append(last_id_backup_temp[i_alb]+1)
i_alb += 1
if message_album.reply_to is not None:
try:
id_reply_original = message_album.get_reply_message().id
except AttributeError as error: #https://airbrake.io/blog/php-exception-handling/attributeerror
print(" Message ID " + str(last_id_temp[i_alb]) + " it is a response to a deleted message.", file=mon_file)
id_reply_backup = None
except:
print(" ERROR! Message reply to another message.")
print(" ERROR! Message reply to another message.", file=mon_file)
id_reply_backup = None
else:
pass
print(" Message ID " + str(last_id_temp[i_alb]) + " is a response to message ID " + str(id_reply_original), file=mon_file)
#Translate the original reply target into its backup ID; '-1' is used when no pair was recorded for it.
if str(id_reply_original) in IDs[i]:
id_reply_backup = IDs[i][str(id_reply_original)]
else:
id_reply_backup = '-1'
if (message_album.id == last_id_channel): #Particular case, the end of the album coincides with the end of the channel.
send_album(id_reply_backup)
#End of the album. Prints, exits the current iter_messages() and jumps to the outermost iter_messages().
else:
send_album(id_reply_backup)
#Get out of the loop " for"
break
else: #Not is a album, so you have just a single ID.
#Test to see if it's a forwarded message or not. If it is, it only forwards and keeps the original author, otherwise it copies and pastes and strips the link with the initial channel. The result is a copy exactly of the original channel.
#NOTE(review): the copy-and-paste path described above is commented out below, so every single message is forwarded.
last_id_temp[0] = message.id
last_id_backup_temp[0] += 1
#if message.fwd_from is None:
##This copy and paste command, which takes the link with the original channel and protects the message, because if the original channel is banned from Telegram and the backup message comes from forwarding, the message in the backup is also deleted.
#if message.reply_to is not None: #Test if it's a response to another message.
#try: #Tests if it is a response to a deleted message.
#id_reply_original = message.get_reply_message().id
#except AttributeError as error: #https://airbrake.io/blog/php-exception-handling/attributeerror
#print(" Message ID " + str(last_id_temp[0]) + " is a reply an deleted message.", file=mon_file)
#id_reply_backup = None
#send_single(id_reply_backup)
#except:
#client.send_message(entitys[i][1], "ERROR!")
#print("ERROR in test if it's a response to another message.", file=mon_file)
#else: #It's a response, but not a deleted message. The difference here from the previous case is basically only the presence of the "reply_to=int(id_reply_backup)".
#pass
#if str(id_reply_original) in IDs[i]:
#id_reply_backup = IDs[i][str(id_reply_original)]
#else:
#id_reply_backup = '-1'
#print(" Message ID " + str(last_id_temp[0]) + " is a response to message ID " + str(id_reply_original), file=mon_file)
#send_single(id_reply_backup)
#else: #It is not a response to another message.
#id_reply_backup = None
#send_single(id_reply_backup)
#else:
#NOTE(review): the backup ID was already incremented above, before this media-type test - confirm that skipping a poll/game here cannot desynchronize the recorded backup IDs.
if type(message.media).__name__ not in messages_not_iterable: #Don't forward games or polls.
client.forward_messages(entitys[i][1], message.id, entitys[i][0])
if Enable_Author_and_Date == True:
client.send_message(entitys[i][1], author_date, reply_to=int(last_id_backup_temp[0]))
last_id_backup_temp[0] += 1
print(" Message with reply forwards" + time_sent, file=mon_file)
else:
print(" Message forwards" + time_sent, file=mon_file)
#NOTE(review): presumably this else pairs with "if message.message is not None" (same wording as the comment on that test) - confirm, it could also pair with the poll/game test.
else:
print(" Message ID " + str(message.id) + " it is of the 'non-interatable' type" + time_sent, file=mon_file)
except errors.FloodWaitError as e: #Tests the various possible errors.
wait_time += add_time
print('FLOODING ERROR! Necessary to wait ' + str(e.seconds) + ' seconds to continue. Waiting time increased in ' + str(add_time) + ' seconds. Now the waiting time is ' + str(wait_time) + ' seconds.', flush=True, file=mon_file)
sleep(e.seconds)
subrotine() #Recursive call of the routine.
#NOTE(review): the exception object itself is passed as the message here - presumably str(e) is intended; confirm what send_message accepts.
except Exception as e:
client.send_message(entitys[i][1], e)
print("ERROR!", file=mon_file)
print(e, file=mon_file)
#This bare except only runs for exceptions that are not subclasses of Exception (for example KeyboardInterrupt).
except:
client.send_message(entitys[i][1], 'ERROR!')
print("UNKNOWN ERROR!", file=mon_file)
else:
pass
#Prints the ID pairs, between the original channel and the backup channel, only if nothing wrong happens to avoid desynchronization.
#The two empty lists assigned first are immediately replaced by the temp lists.
last_id = []
last_id_backup = []
last_id = last_id_temp
last_id_backup = last_id_backup_temp
for ii_id in range(len(last_id)):
IDs[i][str(last_id[ii_id])] = str(last_id_backup[ii_id])
print(str(last_id[ii_id]) + ' ' + str(last_id_backup[ii_id]), flush=True, file=f_ID[i])
#Remembers the last recorded source ID; the loop that calls this routine skips messages whose ID is not greater than it.
alb_last_id = last_id[ii_id]
#Main loop: backs up every configured entity in turn.
#`i` and `message` are deliberately module-level loop variables: subrotine() reads them as globals.
for i in range(num_entitys):
    num = 0
    last_id = [0]
    last_id_backup = []
    #The album skip marker holds message IDs of the entity being processed, so it must start
    #from zero for each entity. Otherwise every message whose ID is not above the previous
    #entity's last ID would be silently skipped.
    alb_last_id = 0
    #Takes the last ID of the last message posted, even if it was deleted.
    channel_connect = client.get_input_entity(entitys[i][1])
    channel_full_info = client(GetFullChannelRequest(channel=channel_connect))
    last_id_backup.append(channel_full_info.full_chat.read_inbox_max_id)
    print("Backing up the entity: " + entitys[i][0], file=mon_file)
    time1 = datetime.now()
    time = time1.strftime("%m-%d-%Y at %Hhr %Mmin %Ssec")
    print("Start: " + time, file=mon_file)
    print("", file=mon_file)
    #Resume after the last saved source ID; offset_id is an open boundary, so that message itself is not returned.
    for message in client.iter_messages(entitys[i][0], reverse=True, offset_id=ID_last[i]):
        if message.id > alb_last_id: #Skip the album items that subrotine() already sent together with the first one.
            subrotine()
    time2 = datetime.now()
    time = time2.strftime("%m-%d-%Y at %Hhr %Mmin %Ssec")
    print("", file=mon_file)
    print("End of entity backup: " + entitys[i][0], file=mon_file)
    print("End: " + time, file=mon_file)
    duration = time2 - time1
    duration_in_s = duration.total_seconds() #https://stackoverflow.com/a/47207182/5175660
    #Split the elapsed seconds into whole days, hours and minutes.
    days, _rest_after_days = divmod(duration_in_s, 86400)
    hours, _rest_after_hours = divmod(_rest_after_days, 3600)
    minutes, _rest_after_minutes = divmod(_rest_after_hours, 60)
    print("Time spent: %d days, %d hours, %d minutes" % (days, hours, minutes), file=mon_file)
    print("-------------------------------------------------------", file=mon_file)