task_180222

This commit is contained in:
Krisotfers-Solo 2022-02-17 20:43:29 +02:00
parent 1a085b2a84
commit 43a25eb411
16 changed files with 133 additions and 22 deletions

View File

@ -7,7 +7,7 @@ from word2number import w2n
import seaborn as sns
import matplotlib
matplotlib.use('Qt5Agg')
# matplotlib.use('Qt5Agg')
import matplotlib.pyplot as plt
# mathplotlib ir bibliotēka statisku, animētu un interaktīvu vizualizāciju izveidei
@ -50,7 +50,7 @@ def graph_plot():
plt.figure(figsize=(15, 10))
sns.heatmap(data.corr())
plt.savefig("plot1.png")
plt.show()
# plt.show()
# korealācija novērojama starp kolonnām [length,width,wheel-base] un [engine-size,price,horsepower]
# noderīga ir otrā korelācija, jo tā atklāj to savstarpējo ietekmi
@ -60,15 +60,15 @@ def graph_plot():
sns.displot(data["price"])
plt.savefig("plot2.png")
plt.show()
# plt.show()
plt.scatter(data["price"], data["engine-size"])
plt.savefig("plot3.png")
plt.show()
# plt.show()
sns.scatterplot(data["price"], data["engine-size"])
plt.savefig("plot4.png")
plt.show()
# plt.show()
if __name__ == '__main__':

Binary file not shown.

Before

Width:  |  Height:  |  Size: 68 KiB

After

Width:  |  Height:  |  Size: 68 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 13 KiB

After

Width:  |  Height:  |  Size: 13 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 20 KiB

After

Width:  |  Height:  |  Size: 20 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 24 KiB

After

Width:  |  Height:  |  Size: 26 KiB

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

After

Width:  |  Height:  |  Size: 390 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 15 KiB

View File

@ -1,20 +1,92 @@
# Author - Kristiāns Francis Cagulis
# Date - 16.02.2022.
# Date - 17.02.2022.
# Title - Patstāvīgais darbs - pandas
from pathlib import Path as p
from pathlib import Path
import matplotlib
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from ss_scraper import SS
# flats_few = SS("https://www.ss.com/lv/real-estate/flats/riga-region/all/sell/")
# flats_few.get_data()
output_path = "output/graphs"
all_df = []
def read(path):
    """Load one Excel workbook and collect its DataFrame into the shared ``all_df`` list."""
    all_df.append(pd.read_excel(path))
def address():
pass
print(list(p(p(__file__).parent.absolute()).glob("*/*.xlsx")))
def get_data():
    """Combine every scraped Excel workbook into a single DataFrame.

    Recursively finds all ``*.xlsx`` files under this script's directory,
    loads each via ``read`` (which appends to the shared ``all_df`` list),
    writes the concatenated result to output/excel/combined.xlsx and
    returns it.
    """
    base_dir = Path(__file__).parent.absolute()
    for workbook in base_dir.glob("**/*.xlsx"):
        read(workbook)
    combined = pd.concat(all_df).reset_index(drop=True)
    combined.to_excel("output/excel/combined.xlsx", index=False)
    return combined
def graph_plot():
    """Build the price graphs from the combined scraped data."""
    data = get_data()
    # NOTE: correlation heatmap is currently disabled.
    # graph_corr(data)
    graph_price(data)
def graph_corr(data):
    """Save a heatmap of pairwise column correlations to korelacija.png."""
    corr_source = data.copy()
    sns.set_style("whitegrid")
    # plt.figure(figsize=(15, 10))
    sns.heatmap(corr_source.corr())
    plt.savefig(f"{output_path}/korelacija.png")
def graph_price(data):
    """Draw five price scatter plots on one figure and save it.

    Each panel plots "Cena" (price) against one other listing attribute on
    a 3x2 subplot grid; the figure is saved to {output_path}/cenu_grafiki.png.

    :param data: DataFrame of scraped listings; must contain the "Cena"
        column plus each column listed in ``panels`` below.
    """
    # plot settings
    plt.figure(figsize=(50, 30))
    plt.rc("font", size=15)
    # (column, panel title, grid position) for each scatter plot
    panels = [
        ("Stāvs", "Price to floor", (0, 0)),
        ("Istabu skaits", "Price to room amount", (0, 1)),
        ("Kvadratūra", "Price to quadrature", (1, 0)),
        ("Sērija", "Price to series", (1, 1)),
        # BUG FIX: this panel's title was a copy-paste of "Price to floor"
        # even though it plots the placement date.
        ("Izvietošanas datums", "Price to date", (2, 0)),
    ]
    for column, title, position in panels:
        axis = plt.subplot2grid((3, 2), position)
        axis.scatter(data["Cena"], data[column])
        axis.set_title(title)
    plt.savefig(f"{output_path}/cenu_grafiki.png")
def main():
    """Entry point: generate all graphs from the scraped data."""
    graph_plot()


if __name__ == "__main__":
    main()

View File

@ -1,5 +1,5 @@
# Author - Kristiāns Francis Cagulis
# Date - 07.12.2021
# Date - 17.02.2022
# Title - Patstāvīgais darbs "SS.com scraping"
from bs4 import BeautifulSoup
@ -13,16 +13,19 @@ HEADERS = {
class SS:
def __init__(self, url):
def __init__(self, url, name):
self.url = url
self.name = name
def _get_page_amount(self):
    """Return the number of result pages behind ``self.url``.

    Parses the pagination link to the last page and extracts its page
    number; when the page has no pagination markup (a single page of
    results) it falls back to 1.
    """
    page = requests.get(self.url, headers=HEADERS)
    soup = BeautifulSoup(page.content, 'html.parser')
    try:
        last_url = soup.find(class_='td2').findChild('a')['href']
        page_amount = last_url[last_url.find("page") + 4:last_url.find(".html")]
    except (AttributeError, TypeError, KeyError):
        # No pagination block found -> only one page of listings.
        # (Was a bare ``except:``, which also swallowed SystemExit and
        # KeyboardInterrupt; narrowed to what the lookup chain can raise.)
        page_amount = 1
    print(f"Page amount = {page_amount}")
    return int(page_amount)
@ -43,15 +46,18 @@ class SS:
print(f"Page {page_number}")
# getting item data
for el in soup.find_all(id=ids):
for id in soup.find_all(id=ids):
print(f"Item {item_no}")
item_no += 1
for elem in el.find_all(class_='msga2-o pp6'):
for elem in id.find_all(class_='msga2-o pp6'):
items.append(elem.get_text())
if len(id.find_all(class_='msga2-o pp6')) == 7:
del items[-2]
# adverts url
item_url = el.findChild(class_='msg2').findChild('div').findChild('a')['href'] # gets url
item_url = id.findChild(class_='msg2').findChild('div').findChild('a')['href'] # gets url
item_url = "https://www.ss.com" + item_url
item_page = requests.get(item_url, headers=HEADERS)
item_soup = BeautifulSoup(item_page.content, 'html.parser')
@ -70,16 +76,23 @@ class SS:
chunked_items_list = [items[i:i + chunk_size] for i in range(0, len(items), chunk_size)] # combines each 'chunk_size' elements into array
columns = ["Atrašanās vieta", "Istabu skaits", "Kvadratūra", "Stāvs", "Sērija", "Cena", "Pilns sludinājuma teksts", "Izvietošanas datums"]
df = pd.DataFrame(chunked_items_list, columns=columns)
df.to_excel(excel_writer='output/output.xlsx', index=False)
df.to_excel(excel_writer=f"output/excel/output_{self.name}.xlsx", index=False)
print("Done")
# One scraper per region; the second argument tags the per-region output workbook.
flats_many = SS("https://www.ss.com/lv/real-estate/flats/riga/all/sell/", "many")
flats_few = SS("https://www.ss.com/lv/real-estate/flats/riga-region/all/sell/", "few")
flats_aizkraukle = SS("https://www.ss.com/lv/real-estate/flats/aizkraukle-and-reg/sell/", "aizkraukle")
flats_tukums = SS("https://www.ss.com/lv/real-estate/flats/tukums-and-reg/sell/", "tukums")
flats_ogre = SS("https://www.ss.com/lv/real-estate/flats/ogre-and-reg/sell/", "ogre")
def main():
    """Scrape the currently enabled regions to per-region Excel files."""
    flats_aizkraukle.get_data()
    flats_tukums.get_data()
    # flats_ogre.get_data()
    # flats_few.get_data()
    # flats_many.get_data()
if __name__ == '__main__':

View File

@ -95,3 +95,29 @@ urllib3==1.26.8
webencodings==0.5.1
yapf==0.32.0
zipp==3.7.0
beautifulsoup4==4.10.0
bs4==0.0.1
certifi==2021.10.8
charset-normalizer==2.0.12
cycler==0.11.0
et-xmlfile==1.1.0
fonttools==4.29.1
idna==3.3
kiwisolver==1.3.2
matplotlib==3.5.1
numpy==1.22.2
openpyxl==3.0.9
packaging==21.3
pandas==1.4.1
Pillow==9.0.1
pyparsing==3.0.7
python-dateutil==2.8.2
pytz==2021.3
requests==2.27.1
scipy==1.8.0
seaborn==0.11.2
six==1.16.0
soupsieve==2.3.1
urllib3==1.26.8
word2number==1.1
yapf==0.32.0