task_180222
@@ -7,7 +7,7 @@ from word2number import w2n
import seaborn as sns
import matplotlib

matplotlib.use('Qt5Agg')
# matplotlib.use('Qt5Agg')
import matplotlib.pyplot as plt

# matplotlib is a library for creating static, animated and interactive visualizations
@@ -50,7 +50,7 @@ def graph_plot():
    plt.figure(figsize=(15, 10))
    sns.heatmap(data.corr())
    plt.savefig("plot1.png")
    plt.show()
    # plt.show()

    # a correlation can be observed between the columns [length,width,wheel-base] and [engine-size,price,horsepower]
    # the second correlation is the useful one, since it reveals the mutual influence between those columns
@@ -60,15 +60,15 @@ def graph_plot():

    sns.displot(data["price"])
    plt.savefig("plot2.png")
    plt.show()
    # plt.show()

    plt.scatter(data["price"], data["engine-size"])
    plt.savefig("plot3.png")
    plt.show()
    # plt.show()

    sns.scatterplot(data["price"], data["engine-size"])
    plt.savefig("plot4.png")
    plt.show()
    # plt.show()


if __name__ == '__main__':
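With matplotlib.use('Qt5Agg') and the plt.show() calls commented out above, the script now only writes PNG files to disk. A minimal sketch of that save-only pattern (the Agg backend and the placeholder data are assumptions, not part of the original script):

import matplotlib
matplotlib.use("Agg")  # assumption: a non-interactive backend, so no window is ever opened
import matplotlib.pyplot as plt

plt.figure(figsize=(15, 10))
plt.plot([1, 2, 3], [2, 4, 9])  # placeholder data, just something to write out
plt.savefig("plot_example.png")  # hypothetical output name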
(Image diffs: four existing plot images regenerated — 68 KiB, 13 KiB and 20 KiB unchanged in size, 24 KiB → 26 KiB.)
BIN  february/task_180222/output/excel/combined.xlsx  (new file)
BIN  february/task_180222/output/excel/output_aizkraukle.xlsx  (new file)
BIN  february/task_180222/output/excel/output_few.xlsx  (new file)
BIN  february/task_180222/output/excel/output_ogre.xlsx  (new file)
BIN  february/task_180222/output/excel/output_tukums.xlsx  (new file)
BIN  february/task_180222/output/graphs/cenu_grafiki.png  (new file, 390 KiB)
BIN  february/task_180222/output/graphs/korelacija.png  (new file, 15 KiB)
@@ -1,20 +1,92 @@
# Author - Kristiāns Francis Cagulis
# Date - 16.02.2022.
# Date - 17.02.2022.
# Title - Independent work - pandas

from pathlib import Path as p
from pathlib import Path
import matplotlib
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from ss_scraper import SS

# flats_few = SS("https://www.ss.com/lv/real-estate/flats/riga-region/all/sell/")
# flats_few.get_data()
output_path = "output/graphs"
all_df = []


def read():
    pass
def read(path):
    df = pd.read_excel(path)
    all_df.append(df)


def address():
    pass


print(list(p(p(__file__).parent.absolute()).glob("*/*.xlsx")))
def get_data():
    files = list(Path(Path(__file__).parent.absolute()).glob("**/*.xlsx"))

    for file in files:
        read(file)
    df_out = pd.concat(all_df).reset_index(drop=True)
    df_out.to_excel("output/excel/combined.xlsx", index=False)
    return df_out


def graph_plot():
    data = get_data()
    # graph_corr(data)
    graph_price(data)


def graph_corr(data):
    data_corr = data.copy()
    sns.set_style("whitegrid")
    # plt.figure(figsize=(15, 10))
    sns.heatmap(data_corr.corr())
    plt.savefig(f"{output_path}/korelacija.png")

def graph_price(data):
    # plot settings
    plt.figure(figsize=(50, 30))
    plt.rc("font", size=15)
    # plt.rc("font", titlesize=24)

    # placing the plots in the plane
    plot1 = plt.subplot2grid((3, 2), (0, 0))
    plot2 = plt.subplot2grid((3, 2), (0, 1))
    plot3 = plt.subplot2grid((3, 2), (1, 0))
    plot4 = plt.subplot2grid((3, 2), (1, 1))
    plot5 = plt.subplot2grid((3, 2), (2, 0))

    # price to floor
    plot1.scatter(data["Cena"], data["Stāvs"])
    plot1.set_title("Price to floor")

    # price to room amount
    plot2.scatter(data["Cena"], data["Istabu skaits"])
    plot2.set_title("Price to room amount")

    # price to quadrature
    plot3.scatter(data["Cena"], data["Kvadratūra"])
    plot3.set_title("Price to quadrature")

    # price to series
    plot4.scatter(data["Cena"], data["Sērija"])
    plot4.set_title("Price to series")

    # price to date
    plot5.scatter(data["Cena"], data["Izvietošanas datums"])
    plot5.set_title("Price to date")

    plt.savefig(f"{output_path}/cenu_grafiki.png")


def main():
    graph_plot()


if __name__ == "__main__":
    main()
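In get_data above, the recursive glob pattern "**/*.xlsx" replaces the earlier single-level "*/*.xlsx". A small sketch of the difference using pathlib (the printed counts depend on hypothetical directory contents):

from pathlib import Path

base = Path(__file__).parent.absolute()
one_level = list(base.glob("*/*.xlsx"))    # .xlsx files exactly one folder below base
recursive = list(base.glob("**/*.xlsx"))   # .xlsx files in base and every subfolder, any depth
print(len(one_level), len(recursive))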
@@ -1,5 +1,5 @@
# Author - Kristiāns Francis Cagulis
# Date - 07.12.2021
# Date - 17.02.2022
# Title - Independent work "SS.com scraping"

from bs4 import BeautifulSoup
@@ -13,16 +13,19 @@ HEADERS = {


class SS:

    def __init__(self, url):
    def __init__(self, url, name):
        self.url = url
        self.name = name

    def _get_page_amount(self):
        page = requests.get(self.url, headers=HEADERS)
        soup = BeautifulSoup(page.content, 'html.parser')

        last_url = soup.find(class_='td2').findChild('a')['href']
        page_amount = last_url[last_url.find("page") + 4:last_url.find(".html")]
        try:
            last_url = soup.find(class_='td2').findChild('a')['href']
            page_amount = last_url[last_url.find("page") + 4:last_url.find(".html")]
        except:
            page_amount = 1
        print(f"Page amount = {page_amount}")

        return int(page_amount)
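The slicing in _get_page_amount pulls the page count out of the pagination link; with a hypothetical link it behaves like this (the URL below is illustrative only):

last_url = "/lv/real-estate/flats/riga/all/sell/page5.html"  # hypothetical pagination href
page_amount = last_url[last_url.find("page") + 4:last_url.find(".html")]
print(page_amount)  # "5" — the try/except above falls back to 1 when no such link exists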
@@ -43,15 +46,18 @@
            print(f"Page {page_number}")

            # getting item data
            for el in soup.find_all(id=ids):
            for id in soup.find_all(id=ids):
                print(f"Item {item_no}")
                item_no += 1

                for elem in el.find_all(class_='msga2-o pp6'):
                for elem in id.find_all(class_='msga2-o pp6'):
                    items.append(elem.get_text())

                if len(id.find_all(class_='msga2-o pp6')) == 7:
                    del items[-2]

                # adverts url
                item_url = el.findChild(class_='msg2').findChild('div').findChild('a')['href']  # gets url
                item_url = id.findChild(class_='msg2').findChild('div').findChild('a')['href']  # gets url
                item_url = "https://www.ss.com" + item_url
                item_page = requests.get(item_url, headers=HEADERS)
                item_soup = BeautifulSoup(item_page.content, 'html.parser')
@@ -70,16 +76,23 @@
        chunked_items_list = [items[i:i + chunk_size] for i in range(0, len(items), chunk_size)]  # combines each 'chunk_size' elements into array
        columns = ["Atrašanās vieta", "Istabu skaits", "Kvadratūra", "Stāvs", "Sērija", "Cena", "Pilns sludinājuma teksts", "Izvietošanas datums"]
        df = pd.DataFrame(chunked_items_list, columns=columns)
        df.to_excel(excel_writer='output/output.xlsx', index=False)
        df.to_excel(excel_writer=f"output/excel/output_{self.name}.xlsx", index=False)
        print("Done")


flats_many = SS("https://www.ss.com/lv/real-estate/flats/riga/all/sell/")
flats_few = SS("https://www.ss.com/lv/real-estate/flats/riga-region/all/sell/")
flats_many = SS("https://www.ss.com/lv/real-estate/flats/riga/all/sell/", "many")
flats_few = SS("https://www.ss.com/lv/real-estate/flats/riga-region/all/sell/", "few")
flats_aizkraukle = SS("https://www.ss.com/lv/real-estate/flats/aizkraukle-and-reg/sell/", "aizkraukle")
flats_tukums = SS("https://www.ss.com/lv/real-estate/flats/tukums-and-reg/sell/", "tukums")
flats_ogre = SS("https://www.ss.com/lv/real-estate/flats/ogre-and-reg/sell/", "ogre")


def main():
    flats_few.get_data()
    flats_aizkraukle.get_data()
    flats_tukums.get_data()
    # flats_ogre.get_data()
    # flats_few.get_data()
    # flats_many.get_data()


if __name__ == '__main__':
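The chunked_items_list comprehension above turns the flat list of scraped fields into one row per advert before it is handed to pd.DataFrame. A tiny worked example (values are illustrative, and chunk_size stands for the number of columns):

items = ["a1", "b1", "c1", "a2", "b2", "c2"]  # illustrative: two adverts with three fields each
chunk_size = 3
rows = [items[i:i + chunk_size] for i in range(0, len(items), chunk_size)]
print(rows)  # [['a1', 'b1', 'c1'], ['a2', 'b2', 'c2']] — ready for pd.DataFrame(rows, columns=...)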
@@ -95,3 +95,29 @@ urllib3==1.26.8
webencodings==0.5.1
yapf==0.32.0
zipp==3.7.0
beautifulsoup4==4.10.0
bs4==0.0.1
certifi==2021.10.8
charset-normalizer==2.0.12
cycler==0.11.0
et-xmlfile==1.1.0
fonttools==4.29.1
idna==3.3
kiwisolver==1.3.2
matplotlib==3.5.1
numpy==1.22.2
openpyxl==3.0.9
packaging==21.3
pandas==1.4.1
Pillow==9.0.1
pyparsing==3.0.7
python-dateutil==2.8.2
pytz==2021.3
requests==2.27.1
scipy==1.8.0
seaborn==0.11.2
six==1.16.0
soupsieve==2.3.1
urllib3==1.26.8
word2number==1.1
yapf==0.32.0