task_180222

2025-10-21 20:10:38 +00:00 · 2022-02-17 20:43:29 +02:00 · 2022-02-17 20:43:29 +02:00 · 43a25eb411
commit 43a25eb411
parent 1a085b2a84
16 changed files with 133 additions and 22 deletions
--- a/february/task_040222/classwork_040222_Cagulis.py
+++ b/february/task_040222/classwork_040222_Cagulis.py
@ -7,7 +7,7 @@ from word2number import w2n
 import seaborn as sns
 import matplotlib
-matplotlib.use('Qt5Agg')
+# matplotlib.use('Qt5Agg')
 import matplotlib.pyplot as plt
 # matplotlib ir bibliotēka statisku, animētu un interaktīvu vizualizāciju izveidei
@ -50,7 +50,7 @@ def graph_plot():
 	plt.figure(figsize=(15, 10))
 	sns.heatmap(data.corr())
 	plt.savefig("plot1.png")
-	plt.show()
+	# plt.show()
 	# korealācija novērojama starp kolonnām [length,width,wheel-base] un [engine-size,price,horsepower]
 	# noderīga ir otrā korelācija, jo tā atklāj to savstarpējo ietekmi
@ -60,15 +60,15 @@ def graph_plot():
 	sns.displot(data["price"])
 	plt.savefig("plot2.png")
-	plt.show()
+	# plt.show()
 	plt.scatter(data["price"], data["engine-size"])
 	plt.savefig("plot3.png")
-	plt.show()
+	# plt.show()
 	sns.scatterplot(data["price"], data["engine-size"])
 	plt.savefig("plot4.png")
-	plt.show()
+	# plt.show()
 if __name__ == '__main__':
--- a/february/task_040222/plot1.png
+++ b/february/task_040222/plot1.png
--- a/february/task_040222/plot2.png
+++ b/february/task_040222/plot2.png
--- a/february/task_040222/plot3.png
+++ b/february/task_040222/plot3.png
--- a/february/task_040222/plot4.png
+++ b/february/task_040222/plot4.png
--- a/february/task_180222/output/excel/combined.xlsx
+++ b/february/task_180222/output/excel/combined.xlsx
--- a/february/task_180222/output/excel/output_aizkraukle.xlsx
+++ b/february/task_180222/output/excel/output_aizkraukle.xlsx
--- a/february/task_180222/output/excel/output_few.xlsx
+++ b/february/task_180222/output/excel/output_few.xlsx
--- a/february/task_180222/output/excel/output_ogre.xlsx
+++ b/february/task_180222/output/excel/output_ogre.xlsx
--- a/february/task_180222/output/excel/output_tukums.xlsx
+++ b/february/task_180222/output/excel/output_tukums.xlsx
--- a/february/task_180222/output/graphs/cenu_grafiki.png
+++ b/february/task_180222/output/graphs/cenu_grafiki.png
--- a/february/task_180222/output/graphs/korelacija.png
+++ b/february/task_180222/output/graphs/korelacija.png
--- a/february/task_180222/output/output.xlsx
+++ b/february/task_180222/output/output.xlsx
--- a/february/task_180222/pd_pandas_k_f_cagulis.py
+++ b/february/task_180222/pd_pandas_k_f_cagulis.py
@ -1,20 +1,92 @@
 # Author - Kristiāns Francis Cagulis
-# Date - 16.02.2022.
+# Date - 17.02.2022.
 # Title - Patstāvīgais darbs - pandas
-from pathlib import Path as p
+from pathlib import Path
 import matplotlib
 import pandas as pd
 import seaborn as sns
 import matplotlib.pyplot as plt
 from ss_scraper import SS
 # flats_few = SS("https://www.ss.com/lv/real-estate/flats/riga-region/all/sell/")
 # flats_few.get_data()
 output_path = "output/graphs"
 all_df = []
-def read():
+def read(path):
-	pass
+	df = pd.read_excel(path)
 	all_df.append(df)
 def address():
 	"""Placeholder with no behaviour yet; always returns None."""
 	return None
-print(list(p(p(__file__).parent.absolute()).glob("*/*.xlsx")))
+def get_data():
 	files = list(Path(Path(__file__).parent.absolute()).glob("**/*.xlsx"))
 	for file in files:
 		read(file)
 	df_out = pd.concat(all_df).reset_index(drop=True)
 	df_out.to_excel("output/excel/combined.xlsx", index=False)
 	return df_out
 def graph_plot():
 	"""Fetch the combined data via get_data() and hand it to graph_price()."""
 	# the correlation heatmap (graph_corr) is currently switched off
 	listings = get_data()
 	graph_price(listings)
 def graph_corr(data):
 	"""Draw a heatmap of ``data.corr()`` and save it as korelacija.png under output_path."""
 	sns.set_style("whitegrid")
 	# work on a copy so the caller's object is left untouched
 	correlations = data.copy().corr()
 	sns.heatmap(correlations)
 	plt.savefig(f"{output_path}/korelacija.png")
 def graph_price(data):
 	"""Scatter-plot the "Cena" column against five other columns and save the figure.
 	The five plots are laid out on a 3x2 grid (last cell left empty) and written to
 	cenu_grafiki.png under output_path.
 	Args:
 		data: object indexable by the column names "Cena", "Stāvs", "Istabu skaits",
 			"Kvadratūra", "Sērija" and "Izvietošanas datums".
 	"""
 	# plot settings
 	plt.figure(figsize=(50, 30))
 	plt.rc("font", size=15)
 	# (column compared with the price, subplot title), listed in row-major grid order
 	panels = [
 		("Stāvs", "Price to floor"),
 		("Istabu skaits", "Price to room amount"),
 		("Kvadratūra", "Price to quadrature"),
 		("Sērija", "Price to series"),
 		# this title used to repeat "Price to floor" although the plot shows the posting date
 		("Izvietošanas datums", "Price to date"),
 	]
 	for index, (column, title) in enumerate(panels):
 		# divmod maps 0..4 onto (row, col) cells of the two-column grid
 		axes = plt.subplot2grid((3, 2), divmod(index, 2))
 		axes.scatter(data["Cena"], data[column])
 		axes.set_title(title)
 	plt.savefig(f"{output_path}/cenu_grafiki.png")
 def main():
 	"""Script entry point; delegates all work to graph_plot()."""
 	graph_plot()
 # call main() only when this file is executed as a script, not when it is imported
 if __name__ == "__main__":
 	main()
--- a/february/task_180222/ss_scraper.py
+++ b/february/task_180222/ss_scraper.py
@ -1,5 +1,5 @@
 # Author - Kristiāns Francis Cagulis
-# Date - 07.12.2021
+# Date - 17.02.2022
 # Title - Patstāvīgais darbs "SS.com scraping"
 from bs4 import BeautifulSoup
@ -13,16 +13,19 @@ HEADERS = {
 class SS:
-
+	def __init__(self, url, name):
 	def __init__(self, url):
 		self.url = url
 		self.name = name
 	def _get_page_amount(self):
 		"""Return the number of result pages for the listing at ``self.url``.
 		The number is cut out of the href of the first link inside the first element
 		with class ``td2``: the text between "page" and ".html". When that element,
 		link or href is missing, or the cut-out text is not a number, 1 is returned.
 		Returns:
 			int: the page count (at least the fallback value 1).
 		"""
 		page = requests.get(self.url, headers=HEADERS)
 		soup = BeautifulSoup(page.content, 'html.parser')
 		try:
 			last_url = soup.find(class_='td2').findChild('a')['href']
 			page_amount = int(last_url[last_url.find("page") + 4:last_url.find(".html")])
 		except (AttributeError, TypeError, KeyError, ValueError):
 			# AttributeError / TypeError: no td2 element or no link inside it;
 			# KeyError: link without href; ValueError: href without a page number.
 			# Anything else (e.g. KeyboardInterrupt) is deliberately not swallowed.
 			page_amount = 1
 		print(f"Page amount = {page_amount}")
 		return page_amount
@ -43,15 +46,18 @@ class SS:
 			print(f"Page {page_number}")
 			# getting item data
-			for el in soup.find_all(id=ids):
+			for id in soup.find_all(id=ids):
 				print(f"Item {item_no}")
 				item_no += 1
-				for elem in el.find_all(class_='msga2-o pp6'):
+				for elem in id.find_all(class_='msga2-o pp6'):
 					items.append(elem.get_text())
 				if len(id.find_all(class_='msga2-o pp6')) == 7:
 					del items[-2]
 				# adverts url
-				item_url = el.findChild(class_='msg2').findChild('div').findChild('a')['href']  # gets url
+				item_url = id.findChild(class_='msg2').findChild('div').findChild('a')['href']  # gets url
 				item_url = "https://www.ss.com" + item_url
 				item_page = requests.get(item_url, headers=HEADERS)
 				item_soup = BeautifulSoup(item_page.content, 'html.parser')
@ -70,16 +76,23 @@ class SS:
 		chunked_items_list = [items[i:i + chunk_size] for i in range(0, len(items), chunk_size)]  # combines each 'chunk_size' elements into array
 		columns = ["Atrašanās vieta", "Istabu skaits", "Kvadratūra", "Stāvs", "Sērija", "Cena", "Pilns sludinājuma teksts", "Izvietošanas datums"]
 		df = pd.DataFrame(chunked_items_list, columns=columns)
-		df.to_excel(excel_writer='output/output.xlsx', index=False)
+		df.to_excel(excel_writer=f"output/excel/output_{self.name}.xlsx", index=False)
 		print("Done")
-flats_many = SS("https://www.ss.com/lv/real-estate/flats/riga/all/sell/")
+flats_many = SS("https://www.ss.com/lv/real-estate/flats/riga/all/sell/", "many")
-flats_few = SS("https://www.ss.com/lv/real-estate/flats/riga-region/all/sell/")
+flats_few = SS("https://www.ss.com/lv/real-estate/flats/riga-region/all/sell/", "few")
 flats_aizkraukle = SS("https://www.ss.com/lv/real-estate/flats/aizkraukle-and-reg/sell/", "aizkraukle")
 flats_tukums = SS("https://www.ss.com/lv/real-estate/flats/tukums-and-reg/sell/", "tukums")
 flats_ogre = SS("https://www.ss.com/lv/real-estate/flats/ogre-and-reg/sell/", "ogre")
 def main():
-	flats_few.get_data()
+	flats_aizkraukle.get_data()
 	flats_tukums.get_data()
 	# flats_ogre.get_data()
 	# flats_few.get_data()
 	# flats_many.get_data()
 if __name__ == '__main__':
--- a/requirements.txt
+++ b/requirements.txt
@ -95,3 +95,29 @@ urllib3==1.26.8
 webencodings==0.5.1
 yapf==0.32.0
 zipp==3.7.0
 beautifulsoup4==4.10.0
 bs4==0.0.1
 certifi==2021.10.8
 charset-normalizer==2.0.12
 cycler==0.11.0
 et-xmlfile==1.1.0
 fonttools==4.29.1
 idna==3.3
 kiwisolver==1.3.2
 matplotlib==3.5.1
 numpy==1.22.2
 openpyxl==3.0.9
 packaging==21.3
 pandas==1.4.1
 Pillow==9.0.1
 pyparsing==3.0.7
 python-dateutil==2.8.2
 pytz==2021.3
 requests==2.27.1
 scipy==1.8.0
 seaborn==0.11.2
 six==1.16.0
 soupsieve==2.3.1
 urllib3==1.26.8
 word2number==1.1
 yapf==0.32.0