mirror of
https://github.com/kristoferssolo/School.git
synced 2025-10-21 20:10:38 +00:00
159 lines
5.2 KiB
Python
159 lines
5.2 KiB
Python
# Date - 15.12.2021
|
|
# Author - Kristiāns Francis Cagulis
|
|
# Title - Homework Selenium v1.1
|
|
#
|
|
# Changes
|
|
# Now every page doesn't open in a new browser window. After all files from a page are downloaded it moves to the next page.
|
|
|
|
import os
|
|
from os.path import exists
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
from selenium import webdriver
|
|
from selenium.webdriver.common.by import By
|
|
from selenium.webdriver.support.ui import WebDriverWait
|
|
from selenium.webdriver.support import expected_conditions as EC
|
|
|
|
# When True, all console prompts are auto-filled with canned values (testing aid).
debug = False  # autofills console inputs

# Accepted console inputs that select each browser (matched case-insensitively
# because the input is lower-cased before comparison).
chromium = ["1", "chromium", "chrome"]
firefox = ["2", "firefox"]
|
|
|
|
|
|
def get_data():
    """Drive a browser through the riga.lv search, filter results to files
    and download every file on the pages the user selected.

    Raises:
        ValueError: if the user's browser choice matches neither known list
            (the original code left ``browser`` unbound and crashed later
            with NameError).
    """
    user_browser, user_pages, search_word, last_page = get_user_input()

    browser = _launch_browser(user_browser)
    try:
        browser.get("https://www.riga.lv/lv/")
        browser.find_element(By.CLASS_NAME, 'cookie-accept-all').click()
        browser.find_element(By.CLASS_NAME, 'search-link').click()

        delay = 2  # seconds to wait for dynamically loaded elements
        WebDriverWait(browser, delay).until(
            EC.presence_of_all_elements_located((By.ID, 'edit-search')))
        search = browser.find_element(By.ID, 'edit-search')
        search.send_keys(search_word)  # writes in search line
        browser.find_element(By.ID, 'search-header-button').click()

        WebDriverWait(browser, delay).until(
            EC.presence_of_all_elements_located((By.CLASS_NAME, 'filter-content')))
        WebDriverWait(browser, delay).until(
            EC.presence_of_all_elements_located((By.ID, 'filter_type_file')))

        # Restrict the results to files only, then re-run the search.
        browser.find_element(
            By.CSS_SELECTOR, 'label[for="filter_type_file"]').click()
        browser.find_element(By.ID, 'search-view-button').click()

        # max() instead of user_pages[-1]: correct even if the list arrives
        # unsorted, and lets us stop paging as soon as possible.
        last_wanted = max(user_pages)
        for current_page in range(1, last_page + 1):
            if current_page in user_pages:
                for file in browser.find_elements(By.CLASS_NAME, 'file'):
                    file_download(file.text, file.get_attribute('href'))
            if current_page >= last_wanted:
                break  # nothing left to download; skip the pointless next-click
            if current_page != last_page:
                browser.find_element(By.CLASS_NAME, 'pager__item--next').click()
    finally:
        # Always release the driver, even when a locator/wait fails mid-run.
        browser.quit()


def _launch_browser(user_browser):
    """Return a Selenium WebDriver for the user's choice.

    Raises ValueError on an unrecognised choice instead of silently leaving
    the driver variable unbound.
    """
    if user_browser in chromium:
        # The chromedriver binary is expected next to the script.
        driver = "chromedriver.exe" if os.name in ('nt', 'dos') else "chromedriver"
        return webdriver.Chrome(driver)
    if user_browser in firefox:
        return webdriver.Firefox()
    raise ValueError(f"Unknown browser choice: {user_browser!r}")
|
|
|
|
|
|
def get_user_input():
    """Collect all console input for the run.

    Returns:
        tuple: (browser, pages, search_word, last_page) where ``pages`` is a
        sorted, de-duplicated list of page numbers.

    Re-prompts on malformed page input instead of the original behaviour of
    printing an error and then returning an unbound ``pages`` (NameError).
    """
    if debug:
        search_word = "dokum"
    else:
        search_word = input("Choose keyword to search: ")

    last_page = get_max_page_amount(search_word)

    print("\nChoose which browser to use:")
    print("1 - chromium (chrome)")
    print("2 - firefox")

    if debug:
        browser = "firefox"
    else:
        browser = input("").lower()

    print(
        f"\nChoose from which pages you want to download files (1 4 7; 2-5; all). Maximum is {last_page} pages.")
    while True:
        if debug:
            user_input = "16-17"
        else:
            user_input = input("").lower()
        try:
            pages = _parse_pages(user_input, last_page)
            break
        except ValueError:
            print("Entered incorrect number/s. Try again.")
            if debug:
                raise  # canned debug input is bad — don't loop forever

    return browser, pages, search_word, last_page


def _parse_pages(user_input, last_page):
    """Parse a page selection like "1 4 7", "2-5" or "all".

    Returns a sorted, de-duplicated list of ints. Raises ValueError on any
    malformed or empty token.

    Fixes two bugs in the original: the range end had ``+ 1`` applied twice
    (so "16-17" also included page 18), and the list was sorted *before*
    ``set()`` de-duplication, which destroyed the ordering again.
    """
    if user_input == "all":
        return list(range(1, last_page + 1))

    pages = set()
    for token in user_input.split():
        if "-" in token:
            first_str, _, second_str = token.partition("-")
            first_num = int(first_str)
            # Clamp the upper bound to the real page count.
            second_num = min(int(second_str), last_page)
            pages.update(range(first_num, second_num + 1))
        else:
            pages.add(int(token))
    if not pages:
        raise ValueError("no pages selected")
    return sorted(pages)
|
|
|
|
|
|
def get_max_page_amount(keyword: str) -> int:
    """Return the number of result pages for *keyword* (file results only).

    Scrapes the pager of the non-JS search page with requests/BeautifulSoup.

    Raises:
        ValueError: when no page count can be found (e.g. no results),
            instead of the original behaviour of printing a message and then
            returning an unbound variable (NameError).
    """
    url = f"https://www.riga.lv/lv/search?q={keyword}&types=file"
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')

    # Preferred: the pager's explicit "last page" shortcut.
    last_item = soup.find(class_='pager__item--last')
    if last_item is not None:
        return int(last_item.get_text().strip())

    # Fallback: highest numbered item in the navigation bar. The original
    # kept only the final *character* of its text, which is wrong for totals
    # with more than one digit (e.g. "12" became 2) — keep the full digit run.
    items = soup.find_all(class_='pager__item page-item')
    if items:
        digits = "".join(ch for ch in items[-1].get_text() if ch.isdigit())
        if digits:
            return int(digits)

    raise ValueError(
        "Something went wrong. Please try again or try another keyword.")
|
|
|
|
|
|
def file_download(file_name, file_url):
    """Download *file_url* into ./files/ under *file_name*.

    A ".pdf" extension is appended when the name does not already end with
    one. Fixes the original's leaked file handle (bare ``open(...).write``)
    and its substring check (``".pdf" in name`` matched the extension
    anywhere in the name, not just at the end).
    """
    print(f"\nNAME: {file_name}")
    print(f"URL: {file_url}")

    path = "files"
    # Race-free replacement for exists() + mkdir().
    os.makedirs(path, exist_ok=True)

    response = requests.get(file_url)
    if not file_name.lower().endswith(".pdf"):
        file_name += ".pdf"
    with open(os.path.join(path, file_name), "wb") as fh:
        fh.write(response.content)
|
|
|
|
|
|
def main():
    """Script entry point: run the interactive scrape-and-download flow."""
    get_data()
|
|
|
|
|
|
# Run only when executed as a script, not when imported as a module.
if __name__ == '__main__':
    main()
|