mirror of
https://github.com/kristoferssolo/School.git
synced 2025-10-21 20:10:38 +00:00
Finished task_071221 SS.com scraper
This commit is contained in:
parent
344f36ecd8
commit
7aeec062ed
BIN
IKEA_scraper/ikea_scraper.zip
Normal file
Binary file not shown.
BIN
december/task_011221/output.xlsx
Normal file
Binary file not shown.
@@ -1,90 +0,0 @@
# Author - Kristiāns Francis Cagulis
# Date - 01.12.2021
# Title - Individual assignment

from bs4 import BeautifulSoup
import requests
import pandas as pd
import openpyxl

HEADERS = {
    "User-Agent":
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36 Vivaldi/4.1.2369.21'
}


class SS:
    def __init__(self, url):
        self.url = url

    def _get_page_amount(self):
        current_page = None
        page_amount = 1
        url = self.url
        while current_page != page_amount:
            current_page = page_amount
            page = requests.get(url, headers=HEADERS)
            soup = BeautifulSoup(page.content, 'html.parser')

            # getting the maximum page number from the pager links
            for el in soup.find_all(class_='navi'):
                cropped_number = el.get_text().strip()
                if cropped_number.isnumeric():
                    cropped_number = int(cropped_number)
                    if cropped_number > page_amount:
                        page_amount = cropped_number
            url = self.url + f"/page{page_amount}.html"

        return page_amount

    def get_data(self):
        items = []
        combined_list = []
        # combined_list.clear()
        for page_number in range(1, self._get_page_amount() + 1):
            url = self.url + f"/page{page_number}.html"

            page = requests.get(url, headers=HEADERS)
            soup = BeautifulSoup(page.content, 'html.parser')
            ids = [tag['id'] for tag in soup.select('tr[id]')]  # creates a list of row ids
            ids = [x for x in ids if "tr_bnr" not in x]  # removes "tr_bnr" (banner) ids from the list
            ids.remove("head_line")  # removes the first "head_line" id

            # TODO
            # Full advertisement text
            # Advertisement publication date

            # getting product data
            for el in soup.find_all(id=ids):
                for elem in el.find_all(class_='msga2-o pp6'):
                    items.append(elem.get_text())

        chunk_size = 6
        chunked_items_list = [items[i:i + chunk_size] for i in range(0, len(items), chunk_size)]

        columns = [
            "Atrašanās vieta",
            "Istabu skaits",
            "Kvadratūra",
            "Stāvs",
            "Sērija",
            "Cena",
            # "Pilns sludinājuma teksts",
            # "Izvietošanas datums"
        ]

        df = pd.DataFrame(chunked_items_list, columns=columns)
        df.to_excel(excel_writer='test.xlsx', index=False)
        print(df)


flats = SS("https://www.ss.com/lv/real-estate/flats/riga-region/all/sell")
flats2 = SS("https://www.ss.com/lv/real-estate/flats/riga/all/sell/")


def main():
    flats.get_data()


if __name__ == '__main__':
    main()
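_get_page_amount keeps re-fetching the highest page it has seen so far and scans the pager links (class "navi") for a larger number; the loop stops once a fetch no longer raises the maximum. A minimal sketch of that fixed-point idea without any network access, where the hypothetical fake_pager function stands in for fetching a listing page and reading its pager:

def fake_pager(page_number):
    # stand-in for a page fetch: returns the page numbers visible in the
    # pager of that page, assuming a sliding window around the current page
    last_page = 17
    return list(range(max(1, page_number - 2), min(last_page, page_number + 3) + 1))


current_page = None
page_amount = 1
while current_page != page_amount:
    current_page = page_amount
    for number in fake_pager(page_amount):  # stands in for the "navi" links
        if number > page_amount:
            page_amount = number

print(page_amount)  # 17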
125
december/task_011221/task_011221_ss_scraper.py
Normal file
@@ -0,0 +1,125 @@
# Author - Kristiāns Francis Cagulis
# Date - 07.12.2021
# Title - Individual assignment "SS.com scraping"

from bs4 import BeautifulSoup
import requests
import pandas as pd
from PIL import Image
from io import BytesIO
from openpyxl.styles import Font, Alignment
import openpyxl

HEADERS = {"User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36 Vivaldi/4.1.2369.21'}


class SS:
    def __init__(self, url):
        self.url = url

    def _get_page_amount(self):
        current_page = None
        page_amount = 1
        url = self.url
        while current_page != page_amount:
            current_page = page_amount
            page = requests.get(url, headers=HEADERS)
            soup = BeautifulSoup(page.content, 'html.parser')

            # getting the maximum page number from the pager links
            for el in soup.find_all(class_='navi'):
                cropped_number = el.get_text().strip()
                if cropped_number.isnumeric():
                    cropped_number = int(cropped_number)
                    if cropped_number > page_amount:
                        page_amount = cropped_number
            url = self.url + f"/page{page_amount}.html"
        print(f"Page amount = {page_amount}")
        return page_amount

    def get_data(self):
        items = []
        images = []
        item_no = 1
        for page_number in range(1, self._get_page_amount() + 1):
            url = self.url + f"/page{page_number}.html"

            page = requests.get(url, headers=HEADERS)
            soup = BeautifulSoup(page.content, 'html.parser')

            # item ids
            ids = [tag['id'] for tag in soup.select('tr[id]')]  # creates a list of row ids
            ids = [x for x in ids if "tr_bnr" not in x]  # removes "tr_bnr" (banner) ids from the list
            ids.remove("head_line")  # removes the first "head_line" id
            print(f"Page {page_number}")

            # getting item data
            for el in soup.find_all(id=ids):
                print(f"Item {item_no}")
                item_no += 1

                # image
                image_url = el.find(class_='msga2').find_next_sibling().findChild('a').findChild('img')['src']  # gets the thumbnail url
                response = requests.get(image_url)
                img = Image.open(BytesIO(response.content))
                images.append(img)

                for elem in el.find_all(class_='msga2-o pp6'):
                    items.append(elem.get_text())

                # advert's url
                item_url = el.findChild(class_='msg2').findChild('div').findChild('a')['href']  # gets the relative url
                item_url = "https://www.ss.com" + item_url
                item_page = requests.get(item_url, headers=HEADERS)
                item_soup = BeautifulSoup(item_page.content, 'html.parser')

                # advert's full text
                item_text = item_soup.find(id='msg_div_msg').get_text()  # gets the full text
                item_text = item_text[:item_text.find("Pilsēta:")]  # removes the trailing part of the text (the details table)
                items.append(item_text)

                # advert's publication date
                item_date = item_soup.find_all('td', class_='msg_footer')  # gets all 'msg_footer' cells
                item_date = item_date[2].get_text()  # extracts the 3rd element
                items.append(item_date[8:18])  # crops out the date

        chunk_size = 8
        chunked_items_list = [items[i:i + chunk_size] for i in range(0, len(items), chunk_size)]  # groups every 'chunk_size' consecutive elements into one row
        columns = ["Atrašanās vieta", "Istabu skaits", "Kvadratūra", "Stāvs", "Sērija", "Cena", "Pilns sludinājuma teksts", "Izvietošanas datums"]
        df = pd.DataFrame(chunked_items_list, columns=columns)
        df.to_excel(excel_writer='output.xlsx', index=False)

        wb = openpyxl.load_workbook("output.xlsx")
        ws = wb.worksheets[0]
        sheet = wb.active  # same (only) worksheet as 'ws'

        # 'I1' cell setup
        ws['I1'] = "Attēli"
        ws['I1'].font = Font(bold=True)
        ws["I1"].alignment = Alignment(horizontal='center', vertical='top')

        # sets column widths
        sheet.column_dimensions['A'].width = 20
        sheet.column_dimensions['G'].width = 50
        sheet.column_dimensions['H'].width = 20
        sheet.column_dimensions['I'].width = 13

        for i in range(len(images)):
            sheet.row_dimensions[i + 2].height = 51  # sets the row height
            ws[f'G{i + 2}'].alignment = Alignment(wrap_text=True)  # enables word wrap

            img = openpyxl.drawing.image.Image(images[i])
            ws.add_image(img, f"I{i + 2}")  # adds the image next to its row
        wb.save("output.xlsx")
        print("Done")


flats = SS("https://www.ss.com/lv/real-estate/flats/riga/all/sell/")


def main():
    flats.get_data()


if __name__ == '__main__':
    main()
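The row-building step in get_data relies on slicing a flat list into fixed-size chunks: every advert contributes exactly chunk_size = 8 values in order, so grouping consecutive 8-element slices yields one spreadsheet row per advert. A minimal, self-contained sketch of that reshaping with made-up values in place of scraped data:

import pandas as pd

# dummy values standing in for two adverts' worth of scraped fields
items = [f"advert{n}_field{f}" for n in range(1, 3) for f in range(1, 9)]

chunk_size = 8
rows = [items[i:i + chunk_size] for i in range(0, len(items), chunk_size)]

columns = ["Atrašanās vieta", "Istabu skaits", "Kvadratūra", "Stāvs",
           "Sērija", "Cena", "Pilns sludinājuma teksts", "Izvietošanas datums"]
df = pd.DataFrame(rows, columns=columns)
print(df)  # two rows, eight columns
# df.to_excel("output.xlsx", index=False)  # same call the scraper uses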
@@ -1,21 +0,0 @@
from bs4 import BeautifulSoup
import requests

HEADERS = {
    "User-Agent":
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36 Vivaldi/4.1.2369.21'
}

url = "https://www.ss.com/lv/real-estate/flats/riga/all/sell/page61.html"

page = requests.get(url, headers=HEADERS)
soup = BeautifulSoup(page.content, 'html.parser')

# print(soup.find_all(class_="navi"))

ids = [tag['id'] for tag in soup.select('tr[id]')]

ids.pop(0)
ids = [x for x in ids if "tr_bnr" not in x]

print(ids)
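Both scrapers start from the id-extraction trick this removed test script was exercising: select every table row that carries an id attribute, then drop the banner rows ("tr_bnr") and the header row ("head_line"). A small self-contained sketch on hand-written markup (illustrative only, not real ss.com output):

from bs4 import BeautifulSoup

html = """
<table>
  <tr id="head_line"><td>header</td></tr>
  <tr id="tr_12345"><td>advert one</td></tr>
  <tr id="tr_bnr_1"><td>banner</td></tr>
  <tr id="tr_67890"><td>advert two</td></tr>
</table>
"""

soup = BeautifulSoup(html, 'html.parser')
ids = [tag['id'] for tag in soup.select('tr[id]')]  # every row that has an id
ids = [x for x in ids if "tr_bnr" not in x]         # drop banner rows
ids.remove("head_line")                             # drop the header row
print(ids)  # ['tr_12345', 'tr_67890']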
Binary file not shown.
14579
november/task_061021/book.txt
Normal file
File diff suppressed because it is too large
44
november/task_061021/kcagulis_061021.py
Normal file
@@ -0,0 +1,44 @@
# Author - Kristiāns Francis Cagulis
# Date - 06.10.2021

import re

CHAPTERS = 61


# creates a file with each chapter and its line number
def read_array(document):
    with open(document, "r", encoding='utf-8') as book:
        lines = [line.strip('\n') for line in book]  # removes the newline characters
    with open('array_output.txt', 'w') as output:
        for i in range(1, CHAPTERS + 1):
            line = lines.index(f"Chapter {i}") + 1  # finds the chapter's line number (1-based)
            output.write(f"Line {line} - Chapter {i}\n")  # writes the line number to the file


# creates a file with each chapter's character position
def read_string(document):
    with open(document, "r", encoding='utf-8') as book:
        lines = book.read()
    with open('str_output.txt', 'w') as output:
        for i in range(1, CHAPTERS + 1):
            # takes the last occurrence of the heading, i.e. the chapter in the
            # body (an earlier match may come from the table of contents)
            position = list(re.finditer(rf"\bChapter {i}\b", lines))[-1]
            output.write(f"Position {position.start()} - Chapter {i}\n")  # writes the position to the file


def read_book(document):
    read_array(document)
    read_string(document)


def main():
    try:
        read_book("book.txt")
    except Exception:
        try:
            read_book("1342-0.txt")
        except Exception:
            read_book(input("Ievadiet faila nosaukumu: "))  # "Enter the file name: "


if __name__ == '__main__':
    main()
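The two helpers differ only in what they report: read_array works line by line and uses list.index to get a 1-based line number, while read_string works on the whole text and uses re.finditer to get a character offset. A tiny in-memory example of both lookups on made-up text (not the Project Gutenberg file):

import re

text = "Contents\nChapter 1\n...\nChapter 1\nIt is a truth universally acknowledged...\n"
lines = text.splitlines()

# line-number lookup, as in read_array (first occurrence, 1-based)
line = lines.index("Chapter 1") + 1
print(f"Line {line} - Chapter 1")  # Line 2 - Chapter 1

# character-offset lookup, as in read_string (last occurrence, i.e. the
# chapter heading in the body rather than the table of contents entry)
position = list(re.finditer(r"\bChapter 1\b", text))[-1]
print(f"Position {position.start()} - Chapter 1")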
44
november/task_241121/demo_ikea.py
Normal file
@@ -0,0 +1,44 @@
# Author - Kristiāns Francis Cagulis
# Date - 01.12.2021
# Title - In-class group work

from bs4 import BeautifulSoup
import requests

url = "https://www.ikea.lv/"
all_page = requests.get(url)

if all_page.status_code == 200:
    page = BeautifulSoup(all_page.content, 'html.parser')
    found = page.find_all(class_="itemBlock")

    info = []
    for item in found:
        item_array = []  # one record per product (reset for every item)
        item = item.findChild("div").findChild(class_="card-body")

        item_name = item.findChild(class_="itemName")
        item_name = item_name.findChild("div").findChild("h6")

        item_array.append(item_name.string)

        price = item.findChild(class_="itemPrice-wrapper")
        price = price.findChild("p").findChild("span")

        try:
            item_array.append(price.attrs["data-price"])
        except KeyError:
            item_array.append(price.attrs["data-pricefamily"])

        all_facts = []
        # NOTE: the loop below iterates over the empty 'all_facts' list, so it
        # never runs; the product-facts extraction looks unfinished
        for facts in all_facts:
            if len(facts) == 1:
                all_facts.append(facts.string)
            else:
                atrasts = facts.findChildren("span")
                for i in atrasts:
                    all_facts.append(i.string)

        item_array.append(all_facts)
        info.append(item_array)

    for ieraksts in info:
        print(ieraksts)
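The try/except around the price lookup exists because some product cards expose a data-price attribute and others only data-pricefamily. A dict-style .get with a fallback expresses the same intent without exception handling; a small sketch on hand-written markup (illustrative only, not real ikea.lv output):

from bs4 import BeautifulSoup

html = '<p><span data-pricefamily="9.99">9,99 EUR</span></p>'
span = BeautifulSoup(html, 'html.parser').find("span")

# prefer "data-price", fall back to "data-pricefamily" when it is missing
price = span.attrs.get("data-price", span.attrs.get("data-pricefamily"))
print(price)  # 9.99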