task_180222

2025-10-21 20:10:38 +00:00 · 2022-02-16 17:29:34 +02:00 · 2022-02-16 17:29:34 +02:00 · 1a085b2a84
commit 1a085b2a84
parent 0bba2c52a2
3 changed files with 106 additions and 0 deletions
--- a/february/task_180222/output/output.xlsx
+++ b/february/task_180222/output/output.xlsx
--- a/february/task_180222/pd_pandas_k_f_cagulis.py
+++ b/february/task_180222/pd_pandas_k_f_cagulis.py
@ -0,0 +1,20 @@
+# Author - Kristiāns Francis Cagulis
+# Date - 16.02.2022.
+# Title - Patstāvīgais darbs - pandas
+
+from pathlib import Path as p
+from ss_scraper import SS
+
+# flats_few = SS("https://www.ss.com/lv/real-estate/flats/riga-region/all/sell/")
+# flats_few.get_data()
+
+
+def read():
+	"""Placeholder with no implementation yet; does nothing and returns ``None``."""
+	return None
+
+
+def address():
+	"""Placeholder with no implementation yet; does nothing and returns ``None``."""
+	return None
+
+
+# Print every .xlsx file found exactly one directory level below this script's folder.
+print(list(p(__file__).parent.absolute().glob("*/*.xlsx")))
--- a/february/task_180222/ss_scraper.py
+++ b/february/task_180222/ss_scraper.py
@ -0,0 +1,86 @@
+# Author - Kristiāns Francis Cagulis
+# Date - 07.12.2021
+# Title - Patstāvīgais darbs "SS.com scraping"
+
+from bs4 import BeautifulSoup
+import requests
+import pandas as pd
+
+# HTTP headers passed to the requests.get calls in this module; they only set a
+# desktop-browser User-Agent string.
+HEADERS = {
+    "User-Agent":
+    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.97 Safari/537.36 Vivaldi/4.1.2369.21'
+}
+
+
+class SS:
+	"""Scraper that walks the pages of one ss.com listing section and exports the adverts to Excel.
+
+	Attributes:
+		url: Base URL of the listing section; numbered page URLs are derived from it.
+	"""
+
+	# Seconds to wait for the server on each request. Without a timeout,
+	# ``requests`` can block indefinitely on a stalled connection.
+	REQUEST_TIMEOUT = 30
+
+	def __init__(self, url):
+		"""Remember the base listing URL to scrape."""
+		self.url = url
+
+	def _get_soup(self, url):
+		"""Download ``url`` with the module-level ``HEADERS`` and return the parsed HTML."""
+		page = requests.get(url, headers=HEADERS, timeout=self.REQUEST_TIMEOUT)
+		return BeautifulSoup(page.content, 'html.parser')
+
+	def _get_page_amount(self):
+		"""Return the number of listing pages as an ``int``.
+
+		The number is read from the ``href`` of the first link inside the first
+		element with class ``td2`` (presumably the link to the last page), taking
+		the text between ``"page"`` and ``".html"``.
+
+		Raises:
+			ValueError: If the link does not contain ``page<N>.html``.
+			AttributeError, TypeError: If the expected elements are missing from the page.
+		"""
+		soup = self._get_soup(self.url)
+
+		last_url = soup.find(class_='td2').findChild('a')['href']
+		start = last_url.find("page")
+		end = last_url.find(".html")
+		# str.find returns -1 when the marker is missing; slicing with that value
+		# would silently pick an unrelated part of the link.
+		if start == -1 or end == -1:
+			raise ValueError(f"Cannot read page amount from link: {last_url!r}")
+		page_amount = last_url[start + 4:end]
+		print(f"Page amount = {page_amount}")
+
+		return int(page_amount)
+
+	def get_data(self):
+		"""Scrape every listing page and every advert page, then export the result to Excel.
+
+		For each advert row the listing cells with class ``msga2-o pp6`` are
+		collected, then the advert's own page is downloaded to obtain its full
+		text (cut at the first ``"Pilsēta:"`` marker, when present) and its
+		publication date. One spreadsheet row is built per advert and the table
+		is written to ``output/output.xlsx`` relative to the current working
+		directory.
+
+		Raises:
+			AttributeError, TypeError, IndexError: If a page lacks the elements looked up here.
+		"""
+		columns = ["Atrašanās vieta", "Istabu skaits", "Kvadratūra", "Stāvs", "Sērija", "Cena", "Pilns sludinājuma teksts", "Izvietošanas datums"]
+		# The last two columns (full text, date) come from the advert page;
+		# the remaining ones come from the listing-table cells.
+		listing_cells = len(columns) - 2
+
+		rows = []
+		item_no = 1
+		# Avoid a double slash when the base URL already ends with "/".
+		base_url = self.url.rstrip('/')
+		for page_number in range(1, self._get_page_amount() + 1):
+			soup = self._get_soup(base_url + f"/page{page_number}.html")
+
+			# advert rows: every <tr> with an id, except banner rows and the header row
+			adverts = [
+				tag for tag in soup.select('tr[id]')
+				if "tr_bnr" not in tag['id'] and tag['id'] != "head_line"
+			]
+			print(f"Page {page_number}")
+
+			# getting item data
+			for advert in adverts:
+				print(f"Item {item_no}")
+				item_no += 1
+
+				cells = [cell.get_text() for cell in advert.find_all(class_='msga2-o pp6')]
+				# Pad/trim to the expected width so one irregular advert cannot
+				# shift the columns of every row that follows it.
+				row = (cells + [None] * listing_cells)[:listing_cells]
+
+				# adverts url
+				item_url = advert.findChild(class_='msg2').findChild('div').findChild('a')['href']
+				item_soup = self._get_soup("https://www.ss.com" + item_url)
+
+				# adverts full text; partition keeps the whole text when the
+				# marker is absent (slicing at find() == -1 would drop the last character)
+				item_text = item_soup.find(id='msg_div_msg').get_text()
+				row.append(item_text.partition("Pilsēta:")[0])
+
+				# adverts publication date: taken from the third 'msg_footer' cell;
+				# characters 8..17 presumably skip a label before the date - confirm on a live page
+				footers = item_soup.find_all('td', class_='msg_footer')
+				row.append(footers[2].get_text()[8:18])
+
+				rows.append(row)
+
+		df = pd.DataFrame(rows, columns=columns)
+
+		import os  # local import: the module's import header is outside this block
+		output_path = 'output/output.xlsx'
+		# Create the target folder so the export does not fail after a full scrape.
+		os.makedirs(os.path.dirname(output_path), exist_ok=True)
+		df.to_excel(excel_writer=output_path, index=False)
+		print("Done")
+
+
+# Ready-made scrapers for two flat-sale listing sections (Riga and Riga region).
+# The names suggest the first section has many adverts and the second few;
+# that is not verified anywhere in this file.
+flats_many = SS("https://www.ss.com/lv/real-estate/flats/riga/all/sell/")
+flats_few = SS("https://www.ss.com/lv/real-estate/flats/riga-region/all/sell/")
+
+
+def main():
+	"""Scrape the ``flats_few`` listing section and export it via ``SS.get_data``."""
+	flats_few.get_data()
+
+
+# Start scraping only when this file is run as a script, not when it is imported.
+if __name__ == '__main__':
+	main()