mirror of
https://github.com/kristoferssolo/School.git
synced 2025-10-21 20:10:38 +00:00
Selenium scraper - done
This commit is contained in:
parent
8e7078887b
commit
49ec9d62a4
4
.gitignore
vendored
4
.gitignore
vendored
@ -1,4 +1,6 @@
|
|||||||
.vscode/
|
.vscode/
|
||||||
**/__pycache__/
|
**/__pycache__/
|
||||||
**/.venv/
|
**/.venv/
|
||||||
**.log
|
**.log
|
||||||
|
/december/task_081221/files
|
||||||
|
/december/task_081221/*.log
|
||||||
@ -10,7 +10,7 @@ from io import BytesIO
|
|||||||
from openpyxl.styles import Font, Alignment
|
from openpyxl.styles import Font, Alignment
|
||||||
import openpyxl
|
import openpyxl
|
||||||
|
|
||||||
HEADERS = {"User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36 Vivaldi/4.1.2369.21'}
|
HEADERS = {"User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.97 Safari/537.36 Vivaldi/4.1.2369.21'}
|
||||||
|
|
||||||
|
|
||||||
class SS:
|
class SS:
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
@ -1,67 +1,118 @@
|
|||||||
# Date - 11.12.2021
|
# Date - 14.12.2021
|
||||||
# Author - Kristiāns Francis Cagulis
|
# Author - Kristiāns Francis Cagulis
|
||||||
# Title - Homework Selenium
|
# Title - Homework Selenium
|
||||||
|
|
||||||
import time
|
import os
|
||||||
|
from os.path import exists
|
||||||
|
import requests
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
from selenium import webdriver
|
from selenium import webdriver
|
||||||
from selenium.webdriver.support.ui import WebDriverWait
|
|
||||||
from selenium.webdriver.support import expected_conditions as EC
|
|
||||||
from selenium.webdriver.common.by import By
|
from selenium.webdriver.common.by import By
|
||||||
|
|
||||||
|
debug = False
|
||||||
|
|
||||||
chromium = ["1", "chromium", "chrome"]
|
chromium = ["1", "chromium", "chrome"]
|
||||||
firefox = ["2", "firefox"]
|
firefox = ["2", "firefox"]
|
||||||
|
|
||||||
|
|
||||||
|
def get_data():
    """Download every file listed on the search-result pages the user selected.

    Asks `get_user_input()` for the browser, the page numbers and the search
    word, then opens each result page in a fresh browser session, accepts the
    cookie banner and passes every element with class ``file`` to
    `file_download()`.

    Raises:
        ValueError: if the chosen browser is in neither the `chromium` nor
            the `firefox` alias list.
    """
    from urllib.parse import quote_plus

    user_browser, user_pages, search_word = get_user_input()

    # Fail early with a clear message; previously an unknown choice fell
    # through both branches and crashed later on an unbound `browser`.
    if user_browser not in chromium and user_browser not in firefox:
        raise ValueError(f"Unsupported browser choice: {user_browser!r}")

    # Encode the keyword so spaces and '&' cannot corrupt the query string.
    query = quote_plus(search_word)

    for page in user_pages:
        if user_browser in chromium:
            if os.name in ('nt', 'dos'):
                browser = webdriver.Chrome("chromedriver.exe")  # windows
            else:
                browser = webdriver.Chrome("chromedriver")  # gnu/linux
        else:
            browser = webdriver.Firefox()

        try:
            # The site numbers result pages from 0, the user counts from 1.
            url = f"https://www.riga.lv/lv/search?q={query}&types=file&page={page - 1}"
            browser.get(url)
            browser.find_element(By.CLASS_NAME, 'cookie-accept-all').click()

            for file in browser.find_elements(By.CLASS_NAME, 'file'):
                file_download(file.text, file.get_attribute('href'))
        finally:
            # Always shut the driver down, even when scraping this page failed,
            # so no orphaned browser process is left behind.
            browser.quit()
|
||||||
|
|
||||||
|
|
||||||
def get_user_input():
|
def get_user_input():
    """Collect the search keyword, browser choice and page selection.

    When the module-level `debug` flag is set, fixed answers ("dokum",
    "firefox", page "1") are used instead of prompting.

    Returns:
        tuple: ``(browser, pages, search_word)`` where `browser` is the
        lower-cased browser answer, `pages` is a sorted list of unique page
        numbers within ``1..last_page`` and `search_word` is the keyword.
    """
    search_word = "dokum" if debug else input("Choose keyword to search: ")

    last_page = get_max_page_amount(search_word)
    print("\nChoose which browser to use:")
    print("1 - chromium (chrome)")
    print("2 - firefox")

    browser = "firefox" if debug else input("").lower()

    print(f"\nChoose from which pages you want to download files (1 4 7; 2-5; all). Maximum is {last_page} pages.")

    # Keep asking until the selection parses; previously the error message
    # was followed by a return that crashed on the unbound `pages`.
    while True:
        user_input = "1" if debug else input("").lower()
        try:
            pages = _parse_page_selection(user_input, last_page)
        except ValueError:
            print("Enered incorrect number/s. Try again.")
        else:
            break

    return browser, pages, search_word


def _parse_page_selection(selection, last_page):
    """Convert "all", "1 4 7" or "2-5" style input into sorted unique pages.

    Args:
        selection: raw text typed by the user.
        last_page: highest page number that exists for the search.

    Returns:
        list[int]: ascending page numbers limited to ``1..last_page``.

    Raises:
        ValueError: if any token is not a number or a ``first-last`` range.
    """
    if selection == "all":
        return list(range(1, last_page + 1))

    chosen = set()
    # split() without an argument tolerates repeated spaces between tokens.
    for token in selection.split():
        if "-" in token:
            first, _, last = token.partition("-")
            # range() excludes its end, so clamp the inclusive end first and
            # add 1 afterwards; clamping the exclusive bound lost the last page.
            chosen.update(range(int(first), min(int(last), last_page) + 1))
        else:
            chosen.add(int(token))

    # Sort after de-duplicating (set order is arbitrary) and drop pages
    # that do not exist.
    return sorted(page for page in chosen if 1 <= page <= last_page)
|
||||||
|
|
||||||
|
|
||||||
def get_data():
|
def get_max_page_amount(keyword: str):
    """Return the number of result pages the file search has for `keyword`.

    Fetches the first result page and reads the page count from the pager:
    first from the element with class ``pager__item--last``, otherwise from
    the last element with classes ``pager__item page-item``.

    Args:
        keyword: search word to look up.

    Returns:
        int: the last page number shown in the pager.

    Raises:
        ValueError: if no page number can be found in the pager.
    """
    import re

    # Let requests build the query string so the keyword is URL-encoded.
    response = requests.get(
        "https://www.riga.lv/lv/search",
        params={"q": keyword, "types": "file"},
    )
    soup = BeautifulSoup(response.content, 'html.parser')

    candidates = [soup.find(class_='pager__item--last')]
    page_items = soup.find_all(class_='pager__item page-item')
    if page_items:
        candidates.append(page_items[-1])

    for item in candidates:
        if item is None:
            continue
        # Take the whole trailing number; slicing off only the last character
        # turned a page count such as "12" into 2.
        numbers = re.findall(r"\d+", item.get_text())
        if numbers:
            return int(numbers[-1])

    print("Something went wrong. Please try again or try another keyword.")
    raise ValueError(f"Could not determine the number of result pages for {keyword!r}")
|
||||||
profile = webdriver.FirefoxProfile()
|
|
||||||
profile.set_preference('browser.download.folderList', 2) # custom location
|
|
||||||
profile.set_preference('browser.download.manager.showWhenStarting', False)
|
|
||||||
profile.set_preference('browser.download.dir', '/files')
|
|
||||||
profile.set_preference('browser.helpApps.neverAsk.saveToDisk', 'text/csv')
|
|
||||||
browser = webdriver.Firefox(profile)
|
|
||||||
|
|
||||||
address = "https://www.riga.lv/lv"
|
|
||||||
browser.get(address)
|
|
||||||
# time.sleep(3)
|
|
||||||
browser.find_element_by_class_name('cookie-accept-all').click()
|
|
||||||
|
|
||||||
browser.find_element_by_class_name('search-link').click()
|
def file_download(file_name, file_url):
    """Download `file_url` into the local ``files`` directory.

    Args:
        file_name: display name of the file; used as the local file name,
            with ``.pdf`` appended when it does not already end with it.
        file_url: address the file is fetched from.
    """
    print(f"\nNAME: {file_name}")
    print(f"URL: {file_url}")

    path = "files"
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs(path, exist_ok=True)

    response = requests.get(file_url)

    # The name comes from scraped page text: neutralise path separators so it
    # can neither point outside `path` nor into a missing sub-directory.
    safe_name = file_name.replace("/", "_").replace("\\", "_")
    # Only a trailing ".pdf" counts as an extension.
    if not safe_name.lower().endswith(".pdf"):
        safe_name = f"{safe_name}.pdf"

    # The context manager closes the handle even if the write fails.
    with open(os.path.join(path, safe_name), "wb") as target:
        target.write(response.content)
|
||||||
WebDriverWait(browser, delay).until(EC.presence_of_all_elements_located((By.ID, 'filter_type_file')))
|
|
||||||
|
|
||||||
browser.find_element_by_css_selector('label[for="filter_type_file"]').click()
|
|
||||||
|
|
||||||
browser.find_element_by_id('search-view-button').click()
|
|
||||||
|
|
||||||
file = browser.find_element_by_class_name('file')
|
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
|
|||||||
@ -1,21 +0,0 @@
|
|||||||
from selenium import webdriver
|
|
||||||
from selenium.webdriver import ActionChains
|
|
||||||
from selenium.webdriver.common.keys import Keys
|
|
||||||
#browser exposes an executable file
|
|
||||||
#Through Selenium test we will invoke the executable file which will then
|
|
||||||
#invoke actual browser
|
|
||||||
driver = webdriver.Chrome(executable_path="chromedriver")
|
|
||||||
# to maximize the browser window
|
|
||||||
driver.maximize_window()
|
|
||||||
#get method to launch the URL
|
|
||||||
driver.get("https://www.tutorialspoint.com/about/about_careers.htm")
|
|
||||||
#to refresh the browser
|
|
||||||
driver.refresh()
|
|
||||||
# identifying the source element
|
|
||||||
source = driver.find_element_by_xpath("//*[text()='Company']")
|
|
||||||
# action chain object creation
|
|
||||||
action = ActionChains(driver)
|
|
||||||
# right click operation and then perform
|
|
||||||
action.context_click(source).perform()
|
|
||||||
#to close the browser
|
|
||||||
# driver.close()
|
|
||||||
26
december/task_081221/user_input.py
Normal file
26
december/task_081221/user_input.py
Normal file
@ -0,0 +1,26 @@
|
|||||||
|
def main():
    """Read a page selection from stdin and print it as a sorted list.

    Accepted input: ``all`` (pages 1-17), or space-separated page numbers
    and inclusive ``first-last`` ranges, e.g. ``3 1 5 2 7-11``. Duplicates
    are removed. Input that is not numeric prints an error message instead.
    """
    user_input = input("Input: ")
    try:
        if user_input == "all":
            pages = list(range(1, 17 + 1))
        else:
            selected = set()
            # split() without an argument tolerates repeated spaces.
            for token in user_input.split():
                if "-" in token:
                    first, _, last = token.partition("-")
                    selected.update(range(int(first), int(last) + 1))
                else:
                    selected.add(int(token))
            # Sort after de-duplicating: converting a sorted list to a set
            # and back does not keep the order.
            pages = sorted(selected)
        print(pages)
    except ValueError:
        # Only malformed numbers are reported; Ctrl+C is no longer swallowed.
        print("Something went wrong. Try again.")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
main()
|
||||||
|
|
||||||
|
# 3 1 5 2 7-11 3-30
|
||||||
@ -39,3 +39,57 @@ wsproto==1.0.0
|
|||||||
xlrd==2.0.1
|
xlrd==2.0.1
|
||||||
xlwt==1.3.0
|
xlwt==1.3.0
|
||||||
yapf==0.31.0
|
yapf==0.31.0
|
||||||
|
async-generator==1.10
|
||||||
|
attrs==21.2.0
|
||||||
|
beautifulsoup4==4.10.0
|
||||||
|
bs4==0.0.1
|
||||||
|
certifi==2021.10.8
|
||||||
|
cffi==1.15.0
|
||||||
|
charset-normalizer==2.0.8
|
||||||
|
cryptography==36.0.0
|
||||||
|
cycler==0.11.0
|
||||||
|
et-xmlfile==1.1.0
|
||||||
|
filelock==3.4.0
|
||||||
|
fonttools==4.28.2
|
||||||
|
geckodriver-autoinstaller==0.1.0
|
||||||
|
h11==0.12.0
|
||||||
|
idna==3.3
|
||||||
|
jsoncodable==0.1.7
|
||||||
|
jsonpickle==2.0.0
|
||||||
|
k-selenium-cookies==0.0.4
|
||||||
|
kiwisolver==1.3.2
|
||||||
|
kproxy==0.0.1
|
||||||
|
matplotlib==3.5.0
|
||||||
|
noraise==0.0.16
|
||||||
|
numpy==1.21.4
|
||||||
|
openpyxl==3.0.9
|
||||||
|
outcome==1.1.0
|
||||||
|
packaging==21.3
|
||||||
|
pandas==1.3.4
|
||||||
|
Pillow==8.4.0
|
||||||
|
pycparser==2.21
|
||||||
|
pyOpenSSL==21.0.0
|
||||||
|
pyparsing==3.0.6
|
||||||
|
python-dateutil==2.8.2
|
||||||
|
pytz==2021.3
|
||||||
|
requests==2.26.0
|
||||||
|
requests-file==1.5.1
|
||||||
|
selenium==4.1.0
|
||||||
|
selenium-browser==0.0.15
|
||||||
|
setuptools-scm==6.3.2
|
||||||
|
six==1.16.0
|
||||||
|
sniffio==1.2.0
|
||||||
|
sortedcontainers==2.4.0
|
||||||
|
soupsieve==2.3.1
|
||||||
|
tldextract==3.1.2
|
||||||
|
tomli==1.2.2
|
||||||
|
trio==0.19.0
|
||||||
|
trio-websocket==0.9.2
|
||||||
|
urllib3==1.26.7
|
||||||
|
wget==3.2
|
||||||
|
wsproto==1.0.0
|
||||||
|
xlrd==2.0.1
|
||||||
|
XlsxWriter==3.0.2
|
||||||
|
xlwt==1.3.0
|
||||||
|
xpath-utils==0.0.3
|
||||||
|
yapf==0.31.0
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user