task_180222
@@ -7,7 +7,7 @@ from word2number import w2n
import seaborn as sns
import matplotlib

matplotlib.use('Qt5Agg')
# matplotlib.use('Qt5Agg')
import matplotlib.pyplot as plt

# matplotlib is a library for creating static, animated and interactive visualizations
@@ -50,7 +50,7 @@ def graph_plot():
    plt.figure(figsize=(15, 10))
    sns.heatmap(data.corr())
    plt.savefig("plot1.png")
    plt.show()
    # plt.show()

    # a correlation can be observed between the columns [length,width,wheel-base] and [engine-size,price,horsepower]
    # the second correlation is the useful one, since it reveals the mutual influence between those columns
@@ -60,15 +60,15 @@ def graph_plot():

    sns.displot(data["price"])
    plt.savefig("plot2.png")
    plt.show()
    # plt.show()

    plt.scatter(data["price"], data["engine-size"])
    plt.savefig("plot3.png")
    plt.show()
    # plt.show()

    sns.scatterplot(data["price"], data["engine-size"])
    plt.savefig("plot4.png")
    plt.show()
    # plt.show()


if __name__ == '__main__':
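With matplotlib.use('Qt5Agg') and the plt.show() calls commented out above, the script now only writes PNG files to disk. A minimal sketch of that save-only pattern (the Agg backend and the placeholder data are assumptions, not part of the original script):

import matplotlib
matplotlib.use("Agg")  # assumption: a non-interactive backend, so no window is ever opened
import matplotlib.pyplot as plt

plt.figure(figsize=(15, 10))
plt.plot([1, 2, 3], [2, 4, 9])  # placeholder data, just something to write out
plt.savefig("plot_example.png")  # hypothetical output name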
(Image diffs: four existing plot images regenerated — 68 KiB, 13 KiB and 20 KiB unchanged in size, 24 KiB → 26 KiB.)
BIN  february/task_180222/output/excel/combined.xlsx  (new file)
BIN  february/task_180222/output/excel/output_aizkraukle.xlsx  (new file)
BIN  february/task_180222/output/excel/output_few.xlsx  (new file)
BIN  february/task_180222/output/excel/output_ogre.xlsx  (new file)
BIN  february/task_180222/output/excel/output_tukums.xlsx  (new file)
BIN  february/task_180222/output/graphs/cenu_grafiki.png  (new file, 390 KiB)
BIN  february/task_180222/output/graphs/korelacija.png  (new file, 15 KiB)
@@ -1,20 +1,92 @@
# Author - Kristiāns Francis Cagulis
# Date - 16.02.2022.
# Date - 17.02.2022.
# Title - Independent work - pandas

from pathlib import Path as p
from pathlib import Path
import matplotlib
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from ss_scraper import SS

# flats_few = SS("https://www.ss.com/lv/real-estate/flats/riga-region/all/sell/")
# flats_few.get_data()
output_path = "output/graphs"
all_df = []


def read():
    pass
def read(path):
    df = pd.read_excel(path)
    all_df.append(df)


def address():
    pass


print(list(p(p(__file__).parent.absolute()).glob("*/*.xlsx")))
def get_data():
    files = list(Path(Path(__file__).parent.absolute()).glob("**/*.xlsx"))

    for file in files:
        read(file)
    df_out = pd.concat(all_df).reset_index(drop=True)
    df_out.to_excel("output/excel/combined.xlsx", index=False)
    return df_out


def graph_plot():
    data = get_data()
    # graph_corr(data)
    graph_price(data)


def graph_corr(data):
    data_corr = data.copy()
    sns.set_style("whitegrid")
    # plt.figure(figsize=(15, 10))
    sns.heatmap(data_corr.corr())
    plt.savefig(f"{output_path}/korelacija.png")

def graph_price(data):
    # plot settings
    plt.figure(figsize=(50, 30))
    plt.rc("font", size=15)
    # plt.rc("font", titlesize=24)

    # placing the plots in the plane
    plot1 = plt.subplot2grid((3, 2), (0, 0))
    plot2 = plt.subplot2grid((3, 2), (0, 1))
    plot3 = plt.subplot2grid((3, 2), (1, 0))
    plot4 = plt.subplot2grid((3, 2), (1, 1))
    plot5 = plt.subplot2grid((3, 2), (2, 0))

    # price to floor
    plot1.scatter(data["Cena"], data["Stāvs"])
    plot1.set_title("Price to floor")

    # price to room amount
    plot2.scatter(data["Cena"], data["Istabu skaits"])
    plot2.set_title("Price to room amount")

    # price to quadrature
    plot3.scatter(data["Cena"], data["Kvadratūra"])
    plot3.set_title("Price to quadrature")

    # price to series
    plot4.scatter(data["Cena"], data["Sērija"])
    plot4.set_title("Price to series")

    # price to date
    plot5.scatter(data["Cena"], data["Izvietošanas datums"])
    plot5.set_title("Price to date")

    plt.savefig(f"{output_path}/cenu_grafiki.png")


def main():
    graph_plot()


if __name__ == "__main__":
    main()
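In get_data above, the recursive glob pattern "**/*.xlsx" replaces the earlier single-level "*/*.xlsx". A small sketch of the difference using pathlib (the printed counts depend on hypothetical directory contents):

from pathlib import Path

base = Path(__file__).parent.absolute()
one_level = list(base.glob("*/*.xlsx"))    # .xlsx files exactly one folder below base
recursive = list(base.glob("**/*.xlsx"))   # .xlsx files in base and every subfolder, any depth
print(len(one_level), len(recursive))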
@@ -1,5 +1,5 @@
# Author - Kristiāns Francis Cagulis
# Date - 07.12.2021
# Date - 17.02.2022
# Title - Independent work "SS.com scraping"

from bs4 import BeautifulSoup
@@ -13,16 +13,19 @@ HEADERS = {


class SS:

    def __init__(self, url):
    def __init__(self, url, name):
        self.url = url
        self.name = name

    def _get_page_amount(self):
        page = requests.get(self.url, headers=HEADERS)
        soup = BeautifulSoup(page.content, 'html.parser')

        last_url = soup.find(class_='td2').findChild('a')['href']
        page_amount = last_url[last_url.find("page") + 4:last_url.find(".html")]
        try:
            last_url = soup.find(class_='td2').findChild('a')['href']
            page_amount = last_url[last_url.find("page") + 4:last_url.find(".html")]
        except:
            page_amount = 1
        print(f"Page amount = {page_amount}")

        return int(page_amount)
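The slicing in _get_page_amount pulls the page count out of the pagination link; with a hypothetical link it behaves like this (the URL below is illustrative only):

last_url = "/lv/real-estate/flats/riga/all/sell/page5.html"  # hypothetical pagination href
page_amount = last_url[last_url.find("page") + 4:last_url.find(".html")]
print(page_amount)  # "5" — the try/except above falls back to 1 when no such link exists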
@@ -43,15 +46,18 @@
            print(f"Page {page_number}")

            # getting item data
            for el in soup.find_all(id=ids):
            for id in soup.find_all(id=ids):
                print(f"Item {item_no}")
                item_no += 1

                for elem in el.find_all(class_='msga2-o pp6'):
                for elem in id.find_all(class_='msga2-o pp6'):
                    items.append(elem.get_text())

                if len(id.find_all(class_='msga2-o pp6')) == 7:
                    del items[-2]

                # adverts url
                item_url = el.findChild(class_='msg2').findChild('div').findChild('a')['href']  # gets url
                item_url = id.findChild(class_='msg2').findChild('div').findChild('a')['href']  # gets url
                item_url = "https://www.ss.com" + item_url
                item_page = requests.get(item_url, headers=HEADERS)
                item_soup = BeautifulSoup(item_page.content, 'html.parser')
@@ -70,16 +76,23 @@
        chunked_items_list = [items[i:i + chunk_size] for i in range(0, len(items), chunk_size)]  # combines each 'chunk_size' elements into array
        columns = ["Atrašanās vieta", "Istabu skaits", "Kvadratūra", "Stāvs", "Sērija", "Cena", "Pilns sludinājuma teksts", "Izvietošanas datums"]
        df = pd.DataFrame(chunked_items_list, columns=columns)
        df.to_excel(excel_writer='output/output.xlsx', index=False)
        df.to_excel(excel_writer=f"output/excel/output_{self.name}.xlsx", index=False)
        print("Done")


flats_many = SS("https://www.ss.com/lv/real-estate/flats/riga/all/sell/")
flats_few = SS("https://www.ss.com/lv/real-estate/flats/riga-region/all/sell/")
flats_many = SS("https://www.ss.com/lv/real-estate/flats/riga/all/sell/", "many")
flats_few = SS("https://www.ss.com/lv/real-estate/flats/riga-region/all/sell/", "few")
flats_aizkraukle = SS("https://www.ss.com/lv/real-estate/flats/aizkraukle-and-reg/sell/", "aizkraukle")
flats_tukums = SS("https://www.ss.com/lv/real-estate/flats/tukums-and-reg/sell/", "tukums")
flats_ogre = SS("https://www.ss.com/lv/real-estate/flats/ogre-and-reg/sell/", "ogre")


def main():
    flats_few.get_data()
    flats_aizkraukle.get_data()
    flats_tukums.get_data()
    # flats_ogre.get_data()
    # flats_few.get_data()
    # flats_many.get_data()


if __name__ == '__main__':
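The chunked_items_list comprehension above turns the flat list of scraped fields into one row per advert before it is handed to pd.DataFrame. A tiny worked example (values are illustrative, and chunk_size stands for the number of columns):

items = ["a1", "b1", "c1", "a2", "b2", "c2"]  # illustrative: two adverts with three fields each
chunk_size = 3
rows = [items[i:i + chunk_size] for i in range(0, len(items), chunk_size)]
print(rows)  # [['a1', 'b1', 'c1'], ['a2', 'b2', 'c2']] — ready for pd.DataFrame(rows, columns=...)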
@@ -95,3 +95,29 @@ urllib3==1.26.8
webencodings==0.5.1
yapf==0.32.0
zipp==3.7.0
beautifulsoup4==4.10.0
bs4==0.0.1
certifi==2021.10.8
charset-normalizer==2.0.12
cycler==0.11.0
et-xmlfile==1.1.0
fonttools==4.29.1
idna==3.3
kiwisolver==1.3.2
matplotlib==3.5.1
numpy==1.22.2
openpyxl==3.0.9
packaging==21.3
pandas==1.4.1
Pillow==9.0.1
pyparsing==3.0.7
python-dateutil==2.8.2
pytz==2021.3
requests==2.27.1
scipy==1.8.0
seaborn==0.11.2
six==1.16.0
soupsieve==2.3.1
urllib3==1.26.8
word2number==1.1
yapf==0.32.0