# Author - Kristiāns Francis Cagulis
# Date - 01.12.2021
# Title - Stundas kopdarbs (class work: scrape item name, price and facts from ikea.lv)
from bs4 import BeautifulSoup
import requests

url = "https://www.ikea.lv/"
all_page = requests.get(url)

if all_page.status_code == 200:
    page = BeautifulSoup(all_page.content, 'html.parser')
    found = page.find_all(class_="itemBlock")

    info = []
    for item in found:
        # BUGFIX: build a fresh list per item. The original created
        # `item_array` once before the loop, so every entry appended to
        # `info` referenced the same ever-growing list.
        item_array = []
        item = item.findChild("div").findChild(class_="card-body")

        item_name = item.findChild(class_="itemName")
        item_name = item_name.findChild("div").findChild("h6")
        item_array.append(item_name.string)

        price = item.findChild(class_="itemPrice-wrapper")
        price = price.findChild("p").findChild("span")

        # Prefer the per-item price attribute; fall back to the family price.
        # BUGFIX: catch only the KeyError raised by a missing attribute
        # instead of the original bare `except:` that swallowed everything.
        try:
            item_array.append(price.attrs["data-price"])
        except KeyError:
            item_array.append(price.attrs["data-pricefamily"])

        all_facts = []
        # NOTE(review): the original wrote `for facts in all_facts:` — it
        # iterated the very list it was filling, so the loop never executed.
        # It presumably should walk the item's fact elements; the intended
        # selector is not recoverable from this file. TODO: confirm the
        # selector (e.g. item.find_all(class_="itemFacts")) and replace the
        # empty placeholder below.
        fact_elements = []  # TODO: real fact-element query goes here
        for facts in fact_elements:
            if len(facts) == 1:
                all_facts.append(facts.string)
            else:
                for span in facts.findChildren("span"):
                    all_facts.append(span.string)

        item_array.append(all_facts)
        info.append(item_array)

    for ieraksts in info:
        print(ieraksts)
# Author - Kristiāns Francis Cagulis
# Date - 01.12.2021
# Title - Patstāvīgais darbs (homework: scrape flat listings from ss.com)

from bs4 import BeautifulSoup
import requests
import pandas as pd
import openpyxl

# Desktop-browser User-Agent: ss.com blocks the default python-requests agent.
HEADERS = {
    "User-Agent":
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36 Vivaldi/4.1.2369.21'
}


class SS:
    """Scraper for one ss.com listing category (e.g. flats for sale)."""

    def __init__(self, url):
        # Base category URL, without the trailing "/pageN.html" part.
        self.url = url

    def _get_page_amount(self):
        """Return the number of result pages.

        Follows the pager ("navi" links) until the largest page number seen
        stops growing, then returns it.
        """
        current_page = None
        page_amount = 1
        url = self.url
        while current_page != page_amount:
            current_page = page_amount
            page = requests.get(url, headers=HEADERS)
            soup = BeautifulSoup(page.content, 'html.parser')

            # Keep the largest numeric pager label seen so far.
            for el in soup.find_all(class_='navi'):
                cropped_number = el.get_text().strip()
                if cropped_number.isnumeric():
                    cropped_number = int(cropped_number)
                    if cropped_number > page_amount:
                        page_amount = cropped_number
            url = self.url + f"/page{page_amount}.html"

        return page_amount

    def get_data(self):
        """Scrape every advert row on every result page.

        Prints each row's cell texts while scraping (work in progress);
        returns None, matching the original behavior.
        """
        combined_list = []
        for page_number in range(1, self._get_page_amount() + 1):
            url = self.url + f"/page{page_number}.html"

            page = requests.get(url, headers=HEADERS)
            soup = BeautifulSoup(page.content, 'html.parser')
            ids = [tag['id'] for tag in soup.select('tr[id]')]  # all row ids
            ids = [x for x in ids if "tr_bnr" not in x]  # drop banner rows
            ids.pop(0)  # drop the first "head_line" (table header) id
            # TODO: split each row into named fields — location, floor,
            # room count, area, price, series, full advert text, post date.

            for el in soup.find_all(id=ids):
                # BUGFIX: build a fresh list per row. The original reused a
                # single `items` list (clear + append the same reference), so
                # every element of combined_list aliased one list that ended
                # up cleared — the collected data was lost.
                items = [elem.get_text() for elem in el.find_all(class_='msga2-o pp6')]
                print(items)
                combined_list.append(items)

        # Intended column headers for the future Excel export.
        columns = [
            "Atrašanās vieta",
            "Istabu skaits",
            "Kvadratūra",
            "Stāvs",
            "Sērija",
            "Cena",
            # "Pilns sludinājuma teksts",
            # "Izvietošanas datums"
        ]

        # TODO: enable once the scraped rows match `columns`:
        # df = pd.DataFrame(combined_list, columns=columns)
        # df.to_excel(excel_writer='test.xlsx', index=False)


flats = SS("https://www.ss.com/lv/real-estate/flats/riga-region/all/sell")


def main():
    flats.get_data()


if __name__ == '__main__':
    main()


# ---------------------------------------------------------------------------
# december/task_011221/test.py — a separate file in the original patch:
# quick manual probe of the row-id extraction on one ss.com result page.
# ---------------------------------------------------------------------------
# from bs4 import BeautifulSoup
# import requests
#
# HEADERS = {
#     "User-Agent":
#     'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36 Vivaldi/4.1.2369.21'
# }
#
# url = "https://www.ss.com/lv/real-estate/flats/riga/all/sell/page61.html"
#
# page = requests.get(url, headers=HEADERS)
# soup = BeautifulSoup(page.content, 'html.parser')
#
# ids = [tag['id'] for tag in soup.select('tr[id]')]
#
# # NOTE(review): the original popped the first id BEFORE filtering banner
# # rows here, the opposite order of get_data() above — if the first row is a
# # banner this drops the wrong id. Confirm the intended order.
# ids.pop(0)
# ids = [x for x in ids if "tr_bnr" not in x]
#
# print(ids)