Selenium scraper - done

Kristofers Solo 2021-12-14 23:46:48 +02:00
parent 8e7078887b
commit 49ec9d62a4
7 changed files with 1381 additions and 65 deletions

.gitignore

@@ -1,4 +1,6 @@
 .vscode/
 **/__pycache__/
 **/.venv/
 **.log
+/december/task_081221/files
+/december/task_081221/*.log


@@ -10,7 +10,7 @@ from io import BytesIO
 from openpyxl.styles import Font, Alignment
 import openpyxl
-HEADERS = {"User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36 Vivaldi/4.1.2369.21'}
+HEADERS = {"User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.97 Safari/537.36 Vivaldi/4.1.2369.21'}
 class SS:
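
The only change here is a bump of the pinned Chrome version inside the User-Agent string. For context, a header dict like this is typically passed along on every request so the site sees a realistic browser signature; a minimal sketch, assuming the surrounding file uses the plain requests pattern (the URL below is a placeholder):

import requests

HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                         "AppleWebKit/537.36 (KHTML, like Gecko) "
                         "Chrome/96.0.4664.97 Safari/537.36 Vivaldi/4.1.2369.21"}

# The header dict rides along on each request; example.com is a placeholder URL.
response = requests.get("https://example.com", headers=HEADERS)
print(response.status_code)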

File diff suppressed because it is too large


@@ -1,67 +1,118 @@
-# Date - 11.12.2021
+# Date - 14.12.2021
 # Author - Kristiāns Francis Cagulis
 # Title - Homework Selenium

-import time
+import os
+from os.path import exists
+
+import requests
+from bs4 import BeautifulSoup
 from selenium import webdriver
-from selenium.webdriver.support.ui import WebDriverWait
-from selenium.webdriver.support import expected_conditions as EC
 from selenium.webdriver.common.by import By

+debug = False
 chromium = ["1", "chromium", "chrome"]
 firefox = ["2", "firefox"]

+
+def get_data():
+    user_browser, user_pages, search_word = get_user_input()
+    for page in user_pages:
+        if user_browser in chromium:
+            if os.name in ('nt', 'dos'):
+                browser = webdriver.Chrome("chromedriver.exe")  # windows
+            else:
+                browser = webdriver.Chrome("chromedriver")  # gnu/linux
+        elif user_browser in firefox:
+            browser = webdriver.Firefox()
+        url = f"https://www.riga.lv/lv/search?q={search_word}&types=file&page={page - 1}"
+        browser.get(url)
+        browser.find_element(By.CLASS_NAME, 'cookie-accept-all').click()
+        files = browser.find_elements(By.CLASS_NAME, 'file')
+        for file in files:
+            file_name = file.text
+            file_url = file.get_attribute('href')
+            file_download(file_name, file_url)
+        browser.quit()
+
+
 def get_user_input():
-    print("Choose which browser to use:")
+    if debug:
+        search_word = "dokum"
+    else:
+        search_word = input("Choose keyword to search: ")
+    last_page = get_max_page_amount(search_word)
+    print("\nChoose which browser to use:")
     print("1 - chromium (chrome)")
     print("2 - firefox")
-    browser = input("").lower()
-    print("Choose from which pages you want to download files (1 4 7; 2-5; all)")
-    pages = input("").lower()
-    return browser, pages
+    if debug:
+        browser = "firefox"
+    else:
+        browser = input("").lower()
+    print(f"\nChoose from which pages you want to download files (1 4 7; 2-5; all). Maximum is {last_page} pages.")
+    pages = []  # falls back to an empty selection if parsing fails
+    try:
+        if debug:
+            user_input = "1"
+        else:
+            user_input = input("").lower()
+        if user_input == "all":
+            pages = list(range(1, last_page + 1))  # creates a list with every page
+        else:
+            user_page_list = user_input.split(" ")
+            for page_range in user_page_list:
+                if "-" in page_range:
+                    first_num = int(page_range[:page_range.find("-")])  # number before the dash
+                    second_num = int(page_range[page_range.find("-") + 1:]) + 1  # number after the dash
+                    if second_num > last_page + 1:  # clamps the range to the last available page
+                        second_num = last_page + 1
+                    user_page_list = user_page_list + list(map(str, range(first_num, second_num)))  # expands the range into str pages
+            pages = [elem for elem in user_page_list if "-" not in elem]  # drops the "a-b" range tokens
+            pages = list(map(int, pages))  # converts str to int
+            pages = list(set(pages))  # removes duplicates
+            pages.sort()  # sorts the final page list
+    except ValueError:
+        print("Entered incorrect number/s. Try again.")
+    return browser, pages, search_word

-def get_data():
-    user_browser, user_pages = get_user_input()
-    if user_browser in chromium:
-        try:
-            browser = webdriver.Chrome("chromedriver.exe")  # windows
-        except:
-            browser = webdriver.Chrome("chromedriver")  # gnu/linux
-    elif user_browser in firefox:
-        profile = webdriver.FirefoxProfile()
-        profile.set_preference('browser.download.folderList', 2)  # custom location
-        profile.set_preference('browser.download.manager.showWhenStarting', False)
-        profile.set_preference('browser.download.dir', '/files')
-        profile.set_preference('browser.helpApps.neverAsk.saveToDisk', 'text/csv')
-        browser = webdriver.Firefox(profile)
-    address = "https://www.riga.lv/lv"
-    browser.get(address)
-    # time.sleep(3)
-    browser.find_element_by_class_name('cookie-accept-all').click()
-    browser.find_element_by_class_name('search-link').click()
-    delay = 0
-    WebDriverWait(browser, delay).until(EC.presence_of_all_elements_located((By.ID, 'edit-search')))
-    browser.find_element_by_id('edit-search').send_keys("dokum")  # writes in search line
-    browser.find_element_by_id('search-header-button').click()
-    browser.maximize_window()
-    WebDriverWait(browser, delay).until(EC.presence_of_all_elements_located((By.CLASS_NAME, 'filter-content')))
-    # delay = 3
-    WebDriverWait(browser, delay).until(EC.presence_of_all_elements_located((By.ID, 'filter_type_file')))
-    browser.find_element_by_css_selector('label[for="filter_type_file"]').click()
-    browser.find_element_by_id('search-view-button').click()
-    file = browser.find_element_by_class_name('file')
+
+def get_max_page_amount(keyword: str):
+    url = f"https://www.riga.lv/lv/search?q={keyword}&types=file"
+    page = requests.get(url)
+    soup = BeautifulSoup(page.content, 'html.parser')
+    try:
+        last_page = soup.find(class_='pager__item--last').get_text().strip()
+    except AttributeError:
+        try:
+            last_page = soup.find_all(class_='pager__item page-item')
+            last_page = last_page[-1].get_text().strip()[-1]  # last number in the navigation bar
+        except (AttributeError, IndexError):
+            print("Something went wrong. Please try again or try another keyword.")
+            last_page = 1  # falls back to a single page
+    return int(last_page)
+
+
+def file_download(file_name, file_url):
+    print(f"\nNAME: {file_name}")
+    print(f"URL: {file_url}")
+    path = "files"
+    if not exists(path):
+        os.mkdir(path)
+    response = requests.get(file_url)
+    if ".pdf" in file_name:
+        file_path = f"{path}/{file_name}"  # name already carries the extension
+    else:
+        file_path = f"{path}/{file_name}.pdf"
+    with open(file_path, "wb") as file:
+        file.write(response.content)


 def main():
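
The rewrite above moves element lookups to Selenium 4's find_element(By.CLASS_NAME, ...) form, replacing the deprecated find_element_by_* helpers of the old code. Driver construction, however, still passes the chromedriver path positionally, which Selenium 4.1 accepts but warns about. A minimal sketch of the fully Selenium-4 style, assuming chromedriver sits on PATH:

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

# Selenium 4 prefers the driver path wrapped in a Service object;
# the positional-path form used above emits a DeprecationWarning.
service = Service("chromedriver")  # assumed location; "chromedriver.exe" on Windows
browser = webdriver.Chrome(service=service)

browser.get("https://www.riga.lv/lv/search?q=dokum&types=file")
# New-style lookup, matching the rewritten get_data() above.
browser.find_element(By.CLASS_NAME, 'cookie-accept-all').click()
browser.quit()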


@@ -1,21 +0,0 @@
-from selenium import webdriver
-from selenium.webdriver import ActionChains
-from selenium.webdriver.common.keys import Keys
-
-# browser exposes an executable file
-# through Selenium we invoke the executable file, which then
-# invokes the actual browser
-driver = webdriver.Chrome(executable_path="chromedriver")
-# maximize the browser window
-driver.maximize_window()
-# get method to launch the URL
-driver.get("https://www.tutorialspoint.com/about/about_careers.htm")
-# refresh the browser
-driver.refresh()
-# identify the source element
-source = driver.find_element_by_xpath("//*[text()='Company']")
-# action chain object creation
-action = ActionChains(driver)
-# right-click operation, then perform
-action.context_click(source).perform()
-# close the browser
-# driver.close()


@@ -0,0 +1,26 @@
+def main():
+    try:
+        user_input = input("Input: ")
+        user_input_array = user_input.split(" ")
+        if user_input == "all":
+            pages = list(range(1, 17 + 1))
+        else:
+            for page_range in user_input_array:
+                if "-" in page_range:
+                    first_num = int(page_range[:page_range.find("-")])  # number before the dash
+                    second_num = int(page_range[page_range.find("-") + 1:]) + 1  # number after the dash
+                    user_input_array = user_input_array + list(map(str, range(first_num, second_num)))  # expands the range into str pages
+            pages = [elem for elem in user_input_array if "-" not in elem]  # drops the "a-b" range tokens
+            pages = list(map(int, pages))  # converts str to int
+            pages = list(set(pages))  # removes duplicates
+            pages.sort()  # sorts the final page list
+        print(pages)
+    except ValueError:
+        print("Something went wrong. Try again.")
+
+
+if __name__ == '__main__':
+    main()
+
+# 3 1 5 2 7-11 3-30
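
A quick sanity check of the trailing sample input: the singles 3 1 5 2 contribute {1, 2, 3, 5}, while 7-11 and 3-30 expand to their full ranges, so after deduplication and sorting the script should print the full run of pages 1 through 30.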


@@ -39,3 +39,57 @@ wsproto==1.0.0
 xlrd==2.0.1
 xlwt==1.3.0
 yapf==0.31.0
+async-generator==1.10
+attrs==21.2.0
+beautifulsoup4==4.10.0
+bs4==0.0.1
+certifi==2021.10.8
+cffi==1.15.0
+charset-normalizer==2.0.8
+cryptography==36.0.0
+cycler==0.11.0
+et-xmlfile==1.1.0
+filelock==3.4.0
+fonttools==4.28.2
+geckodriver-autoinstaller==0.1.0
+h11==0.12.0
+idna==3.3
+jsoncodable==0.1.7
+jsonpickle==2.0.0
+k-selenium-cookies==0.0.4
+kiwisolver==1.3.2
+kproxy==0.0.1
+matplotlib==3.5.0
+noraise==0.0.16
+numpy==1.21.4
+openpyxl==3.0.9
+outcome==1.1.0
+packaging==21.3
+pandas==1.3.4
+Pillow==8.4.0
+pycparser==2.21
+pyOpenSSL==21.0.0
+pyparsing==3.0.6
+python-dateutil==2.8.2
+pytz==2021.3
+requests==2.26.0
+requests-file==1.5.1
+selenium==4.1.0
+selenium-browser==0.0.15
+setuptools-scm==6.3.2
+six==1.16.0
+sniffio==1.2.0
+sortedcontainers==2.4.0
+soupsieve==2.3.1
+tldextract==3.1.2
+tomli==1.2.2
+trio==0.19.0
+trio-websocket==0.9.2
+urllib3==1.26.7
+wget==3.2
+wsproto==1.0.0
+xlrd==2.0.1
+XlsxWriter==3.0.2
+xlwt==1.3.0
+xpath-utils==0.0.3
+yapf==0.31.0
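
Assuming this diff is against the project's requirements.txt, the pinned set installs with pip install -r requirements.txt; note that the appended block repeats several pins already present above (wsproto, xlrd, xlwt, yapf), which pip may reject as duplicate requirements.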