Selenium scraper - done

This commit is contained in:
Kristofers Solo
2021-12-14 23:46:48 +02:00
parent 8e7078887b
commit 49ec9d62a4
7 changed files with 1381 additions and 65 deletions

View File

@@ -10,7 +10,7 @@ from io import BytesIO
from openpyxl.styles import Font, Alignment
import openpyxl
HEADERS = {"User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36 Vivaldi/4.1.2369.21'}
HEADERS = {"User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.97 Safari/537.36 Vivaldi/4.1.2369.21'}
class SS:

File diff suppressed because it is too large Load Diff

View File

@@ -1,67 +1,118 @@
# Date - 11.12.2021
# Date - 14.12.2021
# Author - Kristiāns Francis Cagulis
# Title - Homework Selenium
import time
import os
from os.path import exists
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
debug = False  # True skips the interactive prompts (fixed keyword/browser/pages)
chromium = ["1", "chromium", "chrome"]  # accepted answers meaning "use Chrome/Chromium"
firefox = ["2", "firefox"]  # accepted answers meaning "use Firefox"


def get_data():
    """Download every matching file from the riga.lv search pages the user picked.

    Asks the user for a keyword, browser and page selection, then opens each
    selected result page in a fresh browser instance and downloads each file
    link found on it via file_download().
    """
    user_browser, user_pages, search_word = get_user_input()
    for page in user_pages:
        # One fresh browser per page (original behavior): every page load gets a
        # clean session, so the cookie banner must be accepted again each time.
        if user_browser in chromium:
            if os.name in ('nt', 'dos'):
                browser = webdriver.Chrome("chromedriver.exe")  # windows
            else:
                browser = webdriver.Chrome("chromedriver")  # gnu/linux
        elif user_browser in firefox:
            browser = webdriver.Firefox()
        else:
            # Fix: previously an unrecognized choice left `browser` unbound and
            # the code crashed later with a confusing NameError.
            raise ValueError(f"Unsupported browser choice: {user_browser!r}")
        try:
            # The site's pager is 0-indexed while the user enters 1-indexed pages.
            url = f"https://www.riga.lv/lv/search?q={search_word}&types=file&page={page - 1}"
            browser.get(url)
            browser.find_element(By.CLASS_NAME, 'cookie-accept-all').click()
            files = browser.find_elements(By.CLASS_NAME, 'file')
            for file in files:
                file_download(file.text, file.get_attribute('href'))
        finally:
            # Fix: quit even when scraping raises, so browser processes don't leak.
            browser.quit()
def get_user_input():
    """Prompt for a search keyword, a browser, and a set of result pages.

    Returns:
        (browser, pages, search_word) where `pages` is a sorted, de-duplicated
        list of 1-indexed page numbers, empty if the input could not be parsed.
    """
    if debug:
        search_word = "dokum"
    else:
        search_word = input("Choose keyword to search: ")
    last_page = get_max_page_amount(search_word)
    print("\nChoose which browser to use:")
    print("1 - chromium (chrome)")
    print("2 - firefox")
    if debug:
        browser = "firefox"
    else:
        browser = input("").lower()
    print(f"\nChoose from which pages you want to download files (1 4 7; 2-5; all). Maximum is {last_page} pages.")
    pages = []  # fix: previously left unbound when parsing failed, crashing the return
    try:
        if debug:
            user_input = "1"
        else:
            user_input = input("").lower()
        if user_input == "all":
            pages = list(range(1, last_page + 1))  # every available page
        else:
            user_page_list = user_input.split(" ")
            for page_range in user_page_list:
                if "-" in page_range:
                    first_num = int(page_range[:page_range.find("-")])  # inclusive start
                    second_num = int(page_range[page_range.find("-") + 1:]) + 1  # exclusive end
                    # Fix off-by-one: `second_num` is an exclusive bound, so the cap
                    # must be last_page + 1 or the last requested page was dropped.
                    if second_num > last_page + 1:
                        second_num = last_page + 1
                    # Rebinding the name does not affect the loop, which iterates the
                    # original list; the expanded list is what gets filtered below.
                    user_page_list = user_page_list + list(map(str, range(first_num, second_num)))
            pages = [elem for elem in user_page_list if "-" not in elem]  # drop range tokens
            # Fix: the original sorted first and then went through set(), which
            # destroys ordering; sort AFTER de-duplicating.
            pages = sorted(set(map(int, pages)))
    except ValueError:  # narrowed from bare except: only bad int() input is expected
        print("Entered incorrect number/s. Try again.")
    return browser, pages, search_word
def get_max_page_amount(keyword: str) -> int:
    """Return the number of result pages for *keyword* in the riga.lv file search.

    Scrapes the search page's pager with requests + BeautifulSoup.

    Raises:
        ValueError: when no pager can be found on the page.
    """
    url = f"https://www.riga.lv/lv/search?q={keyword}&types=file"
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')
    try:
        # Preferred: the explicit "last page" pager item.
        last_page = soup.find(class_='pager__item--last').get_text().strip()
    except AttributeError:  # find() returned None - no "last" item on this pager
        try:
            items = soup.find_all(class_='pager__item page-item')
            # Fall back to the last number in the navigation bar.
            # NOTE(review): [-1] keeps only the final character, so this
            # presumably breaks for pagers with more than 9 pages - confirm.
            last_page = items[-1].get_text().strip()[-1]
        except IndexError:  # find_all() came back empty
            print("Something went wrong. Please try again or try another keyword.")
            # Fix: the original fell through to `return int(last_page)` with
            # `last_page` unbound, dying with an unrelated NameError.
            raise ValueError(f"No pager found for keyword {keyword!r}")
    return int(last_page)


def file_download(file_name, file_url):
    """Download *file_url* into ./files/, saving it as *file_name* (.pdf ensured)."""
    print(f"\nNAME: {file_name}")
    print(f"URL: {file_url}")
    path = "files"
    if not exists(path):
        os.mkdir(path)
    response = requests.get(file_url)
    # Keep the given name if it already contains ".pdf", otherwise append it.
    target = file_name if ".pdf" in file_name else f"{file_name}.pdf"
    # Fix: use a context manager so the file handle is closed deterministically
    # (the original open(...).write(...) relied on GC to close it).
    with open(f"{path}/{target}", "wb") as fh:
        fh.write(response.content)
def main():

View File

@@ -1,21 +0,0 @@
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.keys import Keys

# Demo script: drive Chrome through Selenium and right-click an element.
# Selenium talks to the browser through the chromedriver executable.
driver = webdriver.Chrome(executable_path="chromedriver")

# Make the window full-size before navigating.
driver.maximize_window()

# Open the target page, then force a reload of it.
driver.get("https://www.tutorialspoint.com/about/about_careers.htm")
driver.refresh()

# Find the element to act on and perform a right-click (context click) on it.
source = driver.find_element_by_xpath("//*[text()='Company']")
ActionChains(driver).context_click(source).perform()

# Left open on purpose so the result can be inspected.
# driver.close()

View File

@@ -0,0 +1,26 @@
def main(user_input=None):
    """Parse a page-selection string into a sorted list of page numbers.

    Accepts single numbers, ranges ("7-11"), or "all" (pages 1..17).

    Args:
        user_input: the selection string; prompted from stdin when None
            (keeps the original no-argument call working).

    Returns:
        The sorted, de-duplicated list of pages, or None on invalid input.
    """
    try:
        if user_input is None:
            user_input = input("Input: ")
        tokens = user_input.split(" ")
        if user_input == "all":
            pages = list(range(1, 17 + 1))  # hard-coded maximum of 17 pages
        else:
            for token in tokens:
                if "-" in token:
                    start = int(token[:token.find("-")])  # inclusive start
                    stop = int(token[token.find("-") + 1:]) + 1  # exclusive end
                    # Rebinding `tokens` does not affect the loop (it iterates the
                    # original list); the expanded list is what gets filtered below.
                    tokens = tokens + list(map(str, range(start, stop)))
            # Fix: the original sorted first and then de-duplicated via set(),
            # which destroys ordering; sort AFTER removing duplicates. Range
            # tokens (containing "-") are dropped, their expansions kept.
            pages = sorted(set(int(t) for t in tokens if "-" not in t))
        print(pages)
        return pages
    except ValueError:  # narrowed from bare except: only bad int() input is expected
        print("Something went wrong. Try again.")


if __name__ == '__main__':
    main()

# 3 1 5 2 7-11 3-30