mirror of https://github.com/kristoferssolo/School.git
synced 2025-10-21 20:10:38 +00:00

Update task_011221_homework
parent: 2eb3f800d6
commit: e10100c1ba
@@ -1,2 +1,44 @@
# Author - Kristiāns Francis Cagulis
# Date - 01.12.2021
# Title - Stundas kopdarbs (in-class group assignment)

from bs4 import BeautifulSoup
import requests

url = "https://www.ikea.lv/"
all_page = requests.get(url)

if all_page.status_code == 200:
    page = BeautifulSoup(all_page.content, 'html.parser')
    found = page.find_all(class_="itemBlock")

    info = []
    for item in found:
        item_array = []  # fresh list per product; a single shared list would merge every row
        item = item.findChild("div").findChild(class_="card-body")

        item_name = item.findChild(class_="itemName")
        item_name = item_name.findChild("div").findChild("h6")
        item_array.append(item_name.string)

        price = item.findChild(class_="itemPrice-wrapper")
        price = price.findChild("p").findChild("span")
        try:
            item_array.append(price.attrs["data-price"])
        except KeyError:  # some items only expose the family-card price
            item_array.append(price.attrs["data-pricefamily"])

        all_facts = []
        # As committed, this loop walks the just-created empty list, so it
        # never runs (and appending to a list while iterating it would not
        # terminate); the element list it was meant to walk is not in the commit.
        for facts in all_facts:
            if len(facts) == 1:
                all_facts.append(facts.string)
            else:
                atrasts = facts.findChildren("span")
                for i in atrasts:
                    all_facts.append(i.string)

        item_array.append(all_facts)
        info.append(item_array)

    for ieraksts in info:
        print(ieraksts)
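The try/except around data-price is the most brittle step in the script above, since any markup drift on ikea.lv raises before the fallback attribute is even checked. A small defensive alternative, assuming the same itemPrice-wrapper markup as the committed code; extract_price is an illustrative helper, not something in the repo:

def extract_price(card):
    # returns the price attribute as a string, or None when the markup
    # is missing; Tag.get never raises on an absent attribute
    wrapper = card.findChild(class_="itemPrice-wrapper")
    if wrapper is None:
        return None
    span = wrapper.select_one("p span")
    if span is None:
        return None
    return span.get("data-price") or span.get("data-pricefamily")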
98 december/task_011221/task_011221_homework.py Normal file
@@ -0,0 +1,98 @@
# Author - Kristiāns Francis Cagulis
# Date - 01.12.2021
# Title - Patstāvīgais darbs (independent work)

from bs4 import BeautifulSoup
import requests
import pandas as pd
import openpyxl  # engine pandas uses for .xlsx output

HEADERS = {
    "User-Agent":
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36 Vivaldi/4.1.2369.21'
}


class SS:
    def __init__(self, url):
        self.url = url

    def _get_page_amount(self):
        """Follow the pager until the highest page number stops growing."""
        current_page = None
        page_amount = 1
        url = self.url
        while current_page != page_amount:
            current_page = page_amount
            page = requests.get(url, headers=HEADERS)
            soup = BeautifulSoup(page.content, 'html.parser')

            # take the largest numeric label in the pager as the page count
            for el in soup.find_all(class_='navi'):
                cropped_number = el.get_text().strip()
                if cropped_number.isnumeric():
                    cropped_number = int(cropped_number)
                    if cropped_number > page_amount:
                        page_amount = cropped_number
            url = self.url + f"/page{page_amount}.html"

        return page_amount

    def get_data(self):
        combined_list = []
        for page_number in range(1, self._get_page_amount() + 1):
            url = self.url + f"/page{page_number}.html"

            page = requests.get(url, headers=HEADERS)
            soup = BeautifulSoup(page.content, 'html.parser')
            ids = [tag['id'] for tag in soup.select('tr[id]')]  # collect the listing row ids
            ids = [x for x in ids if "tr_bnr" not in x]  # drop banner rows
            ids.pop(0)  # drop the first "head_line" header-row id
            # TODO
            # location
            # floor
            # number of rooms
            # floor area
            # price
            # series
            # full listing text
            # listing publication date

            # collect the cell texts of every listing row
            for el in soup.find_all(id=ids):
                items = []  # new list per row, so rows do not all alias one object
                for elem in el.find_all(class_='msga2-o pp6'):
                    items.append(elem.get_text())
                print(items)
                combined_list.append(items)

        columns = [
            "Atrašanās vieta",  # location
            "Istabu skaits",    # number of rooms
            "Kvadratūra",       # floor area
            "Stāvs",            # floor
            "Sērija",           # series
            "Cena",             # price
            # "Pilns sludinājuma teksts",  # full listing text
            # "Izvietošanas datums"        # publication date
        ]

        # df = pd.DataFrame(combined_list, columns=columns)
        # df.to_excel(excel_writer='test.xlsx', index=False)
        # print(df)


flats = SS("https://www.ss.com/lv/real-estate/flats/riga-region/all/sell")


def main():
    flats.get_data()


if __name__ == '__main__':
    main()
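The export at the end of get_data is left commented out and never consumes the columns list. A minimal sketch of how the two could be joined, assuming each scraped row carries exactly the six labelled values; export_rows is an illustrative helper and test.xlsx is just the name carried over from the commit:

import pandas as pd

def export_rows(rows, columns, path="test.xlsx"):
    # rows wider than the header would make the DataFrame
    # constructor raise, so keep only exact-width rows
    clean = [row for row in rows if len(row) == len(columns)]
    df = pd.DataFrame(clean, columns=columns)
    df.to_excel(path, index=False)  # needs openpyxl installed as the engine
    return df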
21 december/task_011221/test.py Normal file
@@ -0,0 +1,21 @@
from bs4 import BeautifulSoup
import requests

HEADERS = {
    "User-Agent":
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36 Vivaldi/4.1.2369.21'
}

url = "https://www.ss.com/lv/real-estate/flats/riga/all/sell/page61.html"

page = requests.get(url, headers=HEADERS)
soup = BeautifulSoup(page.content, 'html.parser')

# print(soup.find_all(class_="navi"))

ids = [tag['id'] for tag in soup.select('tr[id]')]  # collect every <tr> id

ids.pop(0)  # drop the "head_line" header-row id
ids = [x for x in ids if "tr_bnr" not in x]  # drop banner rows

print(ids)
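The commented-out print of the navi elements above is the exploration that _get_page_amount grew out of. The same max-page step as a standalone sketch, assuming ss.com keeps its numeric pager labels under the navi class; max_page_number is an illustrative helper:

def max_page_number(soup):
    # the largest numeric pager label is taken to be the page count;
    # default=1 covers sections that fit on a single page
    labels = (el.get_text().strip() for el in soup.find_all(class_="navi"))
    return max((int(s) for s in labels if s.isnumeric()), default=1)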