Update task_011221_homework

2025-10-21 20:10:38 +00:00 · 2021-12-02 08:04:48 +02:00 · 2021-12-02 08:04:48 +02:00 · e10100c1ba
commit e10100c1ba
parent 2eb3f800d6
3 changed files with 162 additions and 1 deletions
--- a/december/task_011221/task_011221_classwork.py
+++ b/december/task_011221/task_011221_classwork.py
@ -1,2 +1,44 @@
 # Author - Kristiāns Francis Cagulis
-# Date - 01.12.2021
+# Date - 01.12.2021
+# Title - Stundas kopdarbs
+from bs4 import BeautifulSoup
+import requests
+
+url = "https://www.ikea.lv/"
+all_page = requests.get(url)
+
+# Only parse when the page was fetched successfully; otherwise print nothing.
+if all_page.status_code == 200:
+	page = BeautifulSoup(all_page.content, 'html.parser')
+	found = page.find_all(class_="itemBlock")
+
+	info = []
+	for item in found:
+		# One fresh record per product; sharing a single list across iterations
+		# made every entry of `info` the same ever-growing list.
+		item_array = []
+		item = item.findChild("div").findChild(class_="card-body")
+
+		item_name = item.findChild(class_="itemName")
+		item_name = item_name.findChild("div").findChild("h6")
+		item_array.append(item_name.string)
+
+		price = item.findChild(class_="itemPrice-wrapper")
+		price = price.findChild("p").findChild("span")
+		try:
+			item_array.append(price.attrs["data-price"])
+		except KeyError:  # no "data-price" attribute: fall back to "data-pricefamily"
+			item_array.append(price.attrs["data-pricefamily"])
+
+		# TODO: select the product's fact elements; the original loop iterated
+		# over the empty result list itself, so it never ran.
+		fact_nodes = []
+		all_facts = []
+		for facts in fact_nodes:
+			spans = [facts] if len(facts) == 1 else facts.findChildren("span")
+			all_facts.extend(node.string for node in spans)
+
+		item_array.append(all_facts)
+		info.append(item_array)
+	for ieraksts in info:
+		print(ieraksts)
--- a/december/task_011221/task_011221_homework.py
+++ b/december/task_011221/task_011221_homework.py
@ -0,0 +1,98 @@
+# Author - Kristiāns Francis Cagulis
+# Date - 01.12.2021
+# Title - Patstāvīgais darbs
+
+from bs4 import BeautifulSoup
+import requests
+import pandas as pd
+import openpyxl
+
+# Browser-like User-Agent header passed to every requests.get call in this file.
+HEADERS = {
+    "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36 Vivaldi/4.1.2369.21',
+}
+
+
+class SS:
+	"""Scraper for one paginated ss.com listing section, rooted at ``url``."""
+
+	def __init__(self, url):
+		self.url = url
+
+	def _get_page_amount(self):
+		"""Return the largest page number shown by the ``navi`` pager elements."""
+		current_page = None
+		page_amount = 1
+		url = self.url
+		# Re-fetch at the largest page seen so far until no larger number appears.
+		while current_page != page_amount:
+			current_page = page_amount
+			page = requests.get(url, headers=HEADERS)
+			soup = BeautifulSoup(page.content, 'html.parser')
+
+			# getting max page amount
+			for el in soup.find_all(class_='navi'):
+				cropped_number = el.get_text().strip()
+				if cropped_number.isnumeric():
+					page_amount = max(page_amount, int(cropped_number))
+			url = self.url + f"/page{page_amount}.html"
+
+		return page_amount
+
+	def get_data(self):
+		"""Print and return the cell texts of every listing row on every page.
+
+		Returns:
+			A list with one list of strings per listing row.
+		"""
+		combined_list = []
+		for page_number in range(1, self._get_page_amount() + 1):
+			url = self.url + f"/page{page_number}.html"
+
+			page = requests.get(url, headers=HEADERS)
+			soup = BeautifulSoup(page.content, 'html.parser')
+			ids = [tag['id'] for tag in soup.select('tr[id]')]  # creates list with ids
+			ids = [x for x in ids if "tr_bnr" not in x]  # removes "tr_bnr" from list
+			if not ids:  # no rows on this page; pop(0) would raise IndexError
+				continue
+			ids.pop(0)  # removes first "head_line" id
+			# TODO
+			# Location
+			# floor
+			# room count
+			# floor area
+			# price
+			# series
+			# Full advertisement text
+			# Advertisement posting date
+
+			# getting product name
+			for el in soup.find_all(id=ids):
+				# Build a fresh list per row: reusing one list and clear()-ing it
+				# made every entry of combined_list alias the same final row.
+				items = [elem.get_text() for elem in el.find_all(class_='msga2-o pp6')]
+				print(items)
+				combined_list.append(items)
+
+		# Intended spreadsheet headers for the export that is still disabled below.
+		columns = [
+		    "Atrašanās vieta",
+		    "Istabu skaits",
+		    "Kvadratūra",
+		    "Stāvs",
+		    "Sērija",
+		    "Cena",
+		    #"Pilns sludinājuma teksts",
+		    #"Izvietošanas datums"
+		]
+
+		# df = pd.DataFrame(combined_list)
+		# df.to_excel(excel_writer='test.xlsx', index=False)
+		# print(df)
+		return combined_list
+
+
+# Listing section to scrape: flats for sale, Riga region (per the URL path).
+flats = SS("https://www.ss.com/lv/real-estate/flats/riga-region/all/sell")
+
+
+def main():
+	"""Run the scraper for the module-level ``flats`` listing."""
+	flats.get_data()
+
+
+if __name__ == '__main__':
+	main()
--- a/december/task_011221/test.py
+++ b/december/task_011221/test.py
@ -0,0 +1,21 @@
+from bs4 import BeautifulSoup
+import requests
+
+# Browser-like User-Agent header sent with the request below.
+HEADERS = {
+    "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36 Vivaldi/4.1.2369.21',
+}
+
+url = "https://www.ss.com/lv/real-estate/flats/riga/all/sell/page61.html"
+
+page = requests.get(url, headers=HEADERS)
+soup = BeautifulSoup(page.content, 'html.parser')
+
+# print(soup.find_all(class_="navi"))
+
+# Gather the id of every <tr> that carries one, discard the first entry,
+# then keep only the ids that do not contain "tr_bnr".
+ids = [row['id'] for row in soup.select('tr[id]')]
+del ids[0]
+ids = [row_id for row_id in ids if "tr_bnr" not in row_id]
+print(ids)