# Date - 15.12.2021
# Author - Kristiāns Francis Cagulis
# Title - Homework Selenium v1.1
#
# Changes
# Pages no longer open in a new browser window. After all files on a page are downloaded, the script moves on to the next page.

import os
from os.path import exists
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

debug = False  # autofills console inputs
chromium = ["1", "chromium", "chrome"]
firefox = ["2", "firefox"]


def get_data():
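    # opens the chosen browser, runs a riga.lv search for the keyword,
    # filters the results to files and downloads them from the selected pages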
    user_browser, user_pages, search_word, last_page = get_user_input()
    if user_browser in chromium:
        if os.name in ('nt', 'dos'):
            browser = webdriver.Chrome("chromedriver.exe")  # windows
        else:
            browser = webdriver.Chrome("chromedriver")  # gnu/linux
    elif user_browser in firefox:
        browser = webdriver.Firefox()
    else:
        # an unknown choice would leave `browser` undefined further down
        raise SystemExit("Unknown browser choice. Run the script again.")

    url = "https://www.riga.lv/lv/"
    browser.get(url)
    browser.find_element(By.CLASS_NAME, 'cookie-accept-all').click()
    browser.find_element(By.CLASS_NAME, 'search-link').click()

    delay = 2
    WebDriverWait(browser, delay).until(
        EC.presence_of_all_elements_located((By.ID, 'edit-search')))
    search = browser.find_element(By.ID, 'edit-search')
    search.send_keys(search_word)  # writes in the search line
    browser.find_element(By.ID, 'search-header-button').click()

    WebDriverWait(browser, delay).until(
        EC.presence_of_all_elements_located((By.CLASS_NAME, 'filter-content')))
    WebDriverWait(browser, delay).until(
        EC.presence_of_all_elements_located((By.ID, 'filter_type_file')))
    browser.find_element(
        By.CSS_SELECTOR, 'label[for="filter_type_file"]').click()
    browser.find_element(By.ID, 'search-view-button').click()

    for current_page in range(1, last_page + 1):
        if current_page in user_pages:
            files = browser.find_elements(By.CLASS_NAME, 'file')
            for file in files:
                file_name = file.text
                file_url = file.get_attribute('href')
                file_download(file_name, file_url)
        if current_page == user_pages[-1]:
            break  # everything requested has been downloaded
        if current_page != last_page:
            browser.find_element(By.CLASS_NAME, 'pager__item--next').click()
    browser.quit()


def get_user_input():
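    # asks for the search keyword, the browser and the page selection;
    # returns (browser, pages, search_word, last_page)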
    if debug:
        search_word = "dokum"
    else:
        search_word = input("Choose keyword to search: ")
    last_page = get_max_page_amount(search_word)

    print("\nChoose which browser to use:")
    print("1 - chromium (chrome)")
    print("2 - firefox")
    if debug:
        browser = "firefox"
    else:
        browser = input("").lower()

    print(
        f"\nChoose from which pages you want to download files (1 4 7; 2-5; all). Maximum is {last_page} pages.")
    while True:  # keeps asking until the page selection parses
        try:
            if debug:
                user_input = "16-17"
            else:
                user_input = input("").lower()
            if user_input == "all":
                # creates a list with all pages
                pages = list(range(1, last_page + 1))
            else:
                expanded = []
                for page_range in user_input.split():
                    if "-" in page_range:
                        first_num = int(page_range[:page_range.find("-")])
                        second_num = int(page_range[page_range.find("-") + 1:])
                        if second_num > last_page:  # clamps the range to the maximum page amount
                            second_num = last_page
                        expanded += range(first_num, second_num + 1)
                    else:
                        expanded.append(int(page_range))
                pages = sorted(set(expanded))  # removes duplicates and sorts
            break
        except ValueError:
            print("Entered incorrect number(s). Try again.")
    return browser, pages, search_word, last_page


def get_max_page_amount(keyword: str):
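    # fetches the search results with requests and reads the pager with
    # BeautifulSoup to find how many result pages exist for the keyword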
    url = f"https://www.riga.lv/lv/search?q={keyword}&types=file"
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')
    try:
        last_page = soup.find(class_='pager__item--last').get_text().strip()
    except AttributeError:  # no "last page" button in the pager
        try:
            items = soup.find_all(class_='pager__item page-item')
            # gets the last number from the navigation bar
            last_page = items[-1].get_text().strip()[-1]
        except IndexError:
            # the page count cannot be determined, so stop here
            raise SystemExit("Something went wrong. Please try again or try another keyword.")
    return int(last_page)


def file_download(file_name, file_url):
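    # downloads one search result into the local "files" directory,
    # adding a ".pdf" extension when the name does not contain one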
    print(f"\nNAME: {file_name}")
    print(f"URL: {file_url}")
    path = "files"
    if not exists(path):
        os.mkdir(path)
    response = requests.get(file_url)
    if ".pdf" not in file_name:
        file_name = f"{file_name}.pdf"  # keeps a .pdf extension on every saved file
    with open(f"{path}/{file_name}", "wb") as f:  # the with-block closes the file handle
        f.write(response.content)


def main():
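    # entry point - runs the whole download flow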
    get_data()


if __name__ == '__main__':
    main()