task_180222
@ -7,7 +7,7 @@ from word2number import w2n
|
|||||||
import seaborn as sns
|
import seaborn as sns
|
||||||
import matplotlib
|
import matplotlib
|
||||||
|
|
||||||
matplotlib.use('Qt5Agg')
|
# matplotlib.use('Qt5Agg')
|
||||||
import matplotlib.pyplot as plt
|
import matplotlib.pyplot as plt
|
||||||
|
|
||||||
# matplotlib ir bibliotēka statisku, animētu un interaktīvu vizualizāciju izveidei
|
# matplotlib ir bibliotēka statisku, animētu un interaktīvu vizualizāciju izveidei
|
||||||
@ -50,7 +50,7 @@ def graph_plot():
|
|||||||
plt.figure(figsize=(15, 10))
|
plt.figure(figsize=(15, 10))
|
||||||
sns.heatmap(data.corr())
|
sns.heatmap(data.corr())
|
||||||
plt.savefig("plot1.png")
|
plt.savefig("plot1.png")
|
||||||
plt.show()
|
# plt.show()
|
||||||
|
|
||||||
# korelācija novērojama starp kolonnām [length,width,wheel-base] un [engine-size,price,horsepower]
|
# korelācija novērojama starp kolonnām [length,width,wheel-base] un [engine-size,price,horsepower]
|
||||||
# noderīga ir otrā korelācija, jo tā atklāj to savstarpējo ietekmi
|
# noderīga ir otrā korelācija, jo tā atklāj to savstarpējo ietekmi
|
||||||
@ -60,15 +60,15 @@ def graph_plot():
|
|||||||
|
|
||||||
sns.displot(data["price"])
|
sns.displot(data["price"])
|
||||||
plt.savefig("plot2.png")
|
plt.savefig("plot2.png")
|
||||||
plt.show()
|
# plt.show()
|
||||||
|
|
||||||
plt.scatter(data["price"], data["engine-size"])
|
plt.scatter(data["price"], data["engine-size"])
|
||||||
plt.savefig("plot3.png")
|
plt.savefig("plot3.png")
|
||||||
plt.show()
|
# plt.show()
|
||||||
|
|
||||||
sns.scatterplot(data["price"], data["engine-size"])
|
sns.scatterplot(data["price"], data["engine-size"])
|
||||||
plt.savefig("plot4.png")
|
plt.savefig("plot4.png")
|
||||||
plt.show()
|
# plt.show()
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
|||||||
|
Before Width: | Height: | Size: 68 KiB After Width: | Height: | Size: 68 KiB |
|
Before Width: | Height: | Size: 13 KiB After Width: | Height: | Size: 13 KiB |
|
Before Width: | Height: | Size: 20 KiB After Width: | Height: | Size: 20 KiB |
|
Before Width: | Height: | Size: 24 KiB After Width: | Height: | Size: 26 KiB |
BIN
february/task_180222/output/excel/combined.xlsx
Normal file
BIN
february/task_180222/output/excel/output_aizkraukle.xlsx
Normal file
BIN
february/task_180222/output/excel/output_few.xlsx
Normal file
BIN
february/task_180222/output/excel/output_ogre.xlsx
Normal file
BIN
february/task_180222/output/excel/output_tukums.xlsx
Normal file
BIN
february/task_180222/output/graphs/cenu_grafiki.png
Normal file
|
After Width: | Height: | Size: 390 KiB |
BIN
february/task_180222/output/graphs/korelacija.png
Normal file
|
After Width: | Height: | Size: 15 KiB |
@ -1,20 +1,92 @@
|
|||||||
# Author - Kristiāns Francis Cagulis
|
# Author - Kristiāns Francis Cagulis
|
||||||
# Date - 16.02.2022.
|
# Date - 17.02.2022.
|
||||||
# Title - Patstāvīgais darbs - pandas
|
# Title - Patstāvīgais darbs - pandas
|
||||||
|
|
||||||
from pathlib import Path as p
|
from pathlib import Path
|
||||||
|
import matplotlib
|
||||||
|
import pandas as pd
|
||||||
|
import seaborn as sns
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
from ss_scraper import SS
|
from ss_scraper import SS
|
||||||
|
|
||||||
# flats_few = SS("https://www.ss.com/lv/real-estate/flats/riga-region/all/sell/")
|
# flats_few = SS("https://www.ss.com/lv/real-estate/flats/riga-region/all/sell/")
|
||||||
# flats_few.get_data()
|
# flats_few.get_data()
|
||||||
|
output_path = "output/graphs"
|
||||||
|
all_df = []
|
||||||
|
|
||||||
|
|
||||||
def read(path):
    """Load one Excel workbook and queue it for concatenation.

    Args:
        path: location of the ``.xlsx`` file, passed to ``pd.read_excel``
            with default arguments.

    Returns:
        pandas.DataFrame: the frame that was read.  It is also appended to
        the module-level ``all_df`` list, which ``get_data`` concatenates.
    """
    df = pd.read_excel(path)
    all_df.append(df)
    return df
|
||||||
|
|
||||||
|
|
||||||
def address():
    """Placeholder for address processing; not implemented yet.

    Currently a no-op that returns ``None``.
    """
    # TODO: implement.
|
||||||
|
|
||||||
|
|
||||||
print(list(p(p(__file__).parent.absolute()).glob("*/*.xlsx")))
|
def get_data():
    """Combine every scraped Excel workbook into a single DataFrame.

    Recursively collects the ``*.xlsx`` files below this script's directory,
    loads each one through ``read`` and concatenates the frames.  The result
    is written to ``output/excel/combined.xlsx`` (relative to the current
    working directory) and returned.

    Returns:
        pandas.DataFrame: all rows of all workbooks with a fresh 0..n-1 index.

    Raises:
        ValueError: from ``pd.concat`` when no workbook was found.
    """
    combined_path = Path("output/excel/combined.xlsx")
    base_dir = Path(__file__).parent.absolute()

    # Skip the workbook this function writes itself: otherwise every rerun
    # would feed the previous result back in and duplicate all rows.
    # Sorting makes the row order independent of directory listing order.
    files = sorted(
        file for file in base_dir.glob("**/*.xlsx")
        if file.name != combined_path.name
    )

    # all_df is module-level state; reset it so repeated calls don't
    # accumulate the frames of earlier calls.
    all_df.clear()
    for file in files:
        read(file)

    df_out = pd.concat(all_df).reset_index(drop=True)
    combined_path.parent.mkdir(parents=True, exist_ok=True)
    df_out.to_excel(combined_path, index=False)
    return df_out
|
||||||
|
|
||||||
|
|
||||||
|
def graph_plot():
    """Load the combined listing data and render the price charts."""
    # The correlation heatmap (graph_corr) is currently switched off.
    graph_price(get_data())
|
||||||
|
|
||||||
|
|
||||||
|
def graph_corr(data):
    """Save a heatmap of the pairwise correlations between numeric columns.

    Args:
        data: DataFrame with the listings; it is not modified.

    The image is written to ``{output_path}/korelacija.png``.
    """
    sns.set_style("whitegrid")

    # Start from a fresh figure so the heatmap is not drawn on top of
    # whatever axes happen to be current, and close it again afterwards.
    fig = plt.figure()

    # Text columns cannot be correlated; restrict to numeric ones explicitly
    # instead of relying on pandas dropping them (newer versions raise).
    sns.heatmap(data.select_dtypes(include="number").corr())

    Path(output_path).mkdir(parents=True, exist_ok=True)
    fig.savefig(f"{output_path}/korelacija.png")
    plt.close(fig)
|
||||||
|
|
||||||
|
|
||||||
|
def graph_price(data):
    """Save a grid of scatter plots relating price to other listing fields.

    Args:
        data: DataFrame containing the columns "Cena", "Stāvs",
            "Istabu skaits", "Kvadratūra", "Sērija" and
            "Izvietošanas datums".

    The figure is written to ``{output_path}/cenu_grafiki.png``.
    """
    # (column plotted against the price, subplot title, position in 3x2 grid)
    panels = (
        ("Stāvs", "Price to floor", (0, 0)),
        ("Istabu skaits", "Price to room amount", (0, 1)),
        ("Kvadratūra", "Price to quadrature", (1, 0)),
        ("Sērija", "Price to series", (1, 1)),
        # This panel was previously mislabelled "Price to floor".
        ("Izvietošanas datums", "Price to date", (2, 0)),
    )

    # plot settings
    fig = plt.figure(figsize=(50, 30))
    plt.rc("font", size=15)

    for column, title, position in panels:
        axes = plt.subplot2grid((3, 2), position, fig=fig)
        axes.scatter(data["Cena"], data[column])
        axes.set_title(title)

    Path(output_path).mkdir(parents=True, exist_ok=True)
    fig.savefig(f"{output_path}/cenu_grafiki.png")
    # Release the (very large) figure once it is on disk.
    plt.close(fig)
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Script entry point: build and save the charts."""
    graph_plot()


# Run only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
||||||
@ -1,5 +1,5 @@
|
|||||||
# Author - Kristiāns Francis Cagulis
|
# Author - Kristiāns Francis Cagulis
|
||||||
# Date - 07.12.2021
|
# Date - 17.02.2022
|
||||||
# Title - Patstāvīgais darbs "SS.com scraping"
|
# Title - Patstāvīgais darbs "SS.com scraping"
|
||||||
|
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
@ -13,16 +13,19 @@ HEADERS = {
|
|||||||
|
|
||||||
|
|
||||||
class SS:
|
class SS:
|
||||||
|
def __init__(self, url, name):
|
||||||
def __init__(self, url):
|
|
||||||
self.url = url
|
self.url = url
|
||||||
|
self.name = name
|
||||||
|
|
||||||
def _get_page_amount(self):
    """Return the number of result pages for ``self.url``.

    The listing page is downloaded and the first link inside the first
    element with class ``td2`` is inspected (presumably the pager link to
    the last page - verify against the site markup).  The number between
    ``page`` and ``.html`` in its ``href`` is taken as the page count.

    Returns:
        int: the parsed page count, or 1 when no usable pager link exists.
    """
    page = requests.get(self.url, headers=HEADERS)
    soup = BeautifulSoup(page.content, 'html.parser')

    try:
        last_url = soup.find(class_='td2').findChild('a')['href']
        start = last_url.index("page") + len("page")
        page_amount = int(last_url[start:last_url.index(".html", start)])
    except (AttributeError, TypeError, KeyError, ValueError):
        # AttributeError/TypeError: pager element or its link is missing
        # (find/findChild returned None); KeyError: link has no href;
        # ValueError: href does not contain "page<number>.html".
        page_amount = 1
    print(f"Page amount = {page_amount}")

    return page_amount
|
||||||
@ -43,15 +46,18 @@ class SS:
|
|||||||
print(f"Page {page_number}")
|
print(f"Page {page_number}")
|
||||||
|
|
||||||
# getting item data
|
# getting item data
|
||||||
for el in soup.find_all(id=ids):
|
for id in soup.find_all(id=ids):
|
||||||
print(f"Item {item_no}")
|
print(f"Item {item_no}")
|
||||||
item_no += 1
|
item_no += 1
|
||||||
|
|
||||||
for elem in el.find_all(class_='msga2-o pp6'):
|
for elem in id.find_all(class_='msga2-o pp6'):
|
||||||
items.append(elem.get_text())
|
items.append(elem.get_text())
|
||||||
|
|
||||||
|
if len(id.find_all(class_='msga2-o pp6')) == 7:
|
||||||
|
del items[-2]
|
||||||
|
|
||||||
# adverts url
|
# adverts url
|
||||||
item_url = el.findChild(class_='msg2').findChild('div').findChild('a')['href'] # gets url
|
item_url = id.findChild(class_='msg2').findChild('div').findChild('a')['href'] # gets url
|
||||||
item_url = "https://www.ss.com" + item_url
|
item_url = "https://www.ss.com" + item_url
|
||||||
item_page = requests.get(item_url, headers=HEADERS)
|
item_page = requests.get(item_url, headers=HEADERS)
|
||||||
item_soup = BeautifulSoup(item_page.content, 'html.parser')
|
item_soup = BeautifulSoup(item_page.content, 'html.parser')
|
||||||
@ -70,16 +76,23 @@ class SS:
|
|||||||
chunked_items_list = [items[i:i + chunk_size] for i in range(0, len(items), chunk_size)] # combines each 'chunk_size' elements into array
|
chunked_items_list = [items[i:i + chunk_size] for i in range(0, len(items), chunk_size)] # combines each 'chunk_size' elements into array
|
||||||
columns = ["Atrašanās vieta", "Istabu skaits", "Kvadratūra", "Stāvs", "Sērija", "Cena", "Pilns sludinājuma teksts", "Izvietošanas datums"]
|
columns = ["Atrašanās vieta", "Istabu skaits", "Kvadratūra", "Stāvs", "Sērija", "Cena", "Pilns sludinājuma teksts", "Izvietošanas datums"]
|
||||||
df = pd.DataFrame(chunked_items_list, columns=columns)
|
df = pd.DataFrame(chunked_items_list, columns=columns)
|
||||||
df.to_excel(excel_writer='output/output.xlsx', index=False)
|
df.to_excel(excel_writer=f"output/excel/output_{self.name}.xlsx", index=False)
|
||||||
print("Done")
|
print("Done")
|
||||||
|
|
||||||
|
|
||||||
# One scraper per listing section; the second argument names the data set
# and ends up in the output file name.
flats_many = SS("https://www.ss.com/lv/real-estate/flats/riga/all/sell/", "many")
flats_few = SS("https://www.ss.com/lv/real-estate/flats/riga-region/all/sell/", "few")
flats_aizkraukle = SS("https://www.ss.com/lv/real-estate/flats/aizkraukle-and-reg/sell/", "aizkraukle")
flats_tukums = SS("https://www.ss.com/lv/real-estate/flats/tukums-and-reg/sell/", "tukums")
flats_ogre = SS("https://www.ss.com/lv/real-estate/flats/ogre-and-reg/sell/", "ogre")


def main():
    """Scrape the currently selected regions."""
    # flats_ogre, flats_few and flats_many are defined above but not scraped
    # at the moment; add them to the tuple to include them.
    for scraper in (flats_aizkraukle, flats_tukums):
        scraper.get_data()
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
|||||||
@ -95,3 +95,29 @@ urllib3==1.26.8
|
|||||||
webencodings==0.5.1
|
webencodings==0.5.1
|
||||||
yapf==0.32.0
|
yapf==0.32.0
|
||||||
zipp==3.7.0
|
zipp==3.7.0
|
||||||
|
beautifulsoup4==4.10.0
|
||||||
|
bs4==0.0.1
|
||||||
|
certifi==2021.10.8
|
||||||
|
charset-normalizer==2.0.12
|
||||||
|
cycler==0.11.0
|
||||||
|
et-xmlfile==1.1.0
|
||||||
|
fonttools==4.29.1
|
||||||
|
idna==3.3
|
||||||
|
kiwisolver==1.3.2
|
||||||
|
matplotlib==3.5.1
|
||||||
|
numpy==1.22.2
|
||||||
|
openpyxl==3.0.9
|
||||||
|
packaging==21.3
|
||||||
|
pandas==1.4.1
|
||||||
|
Pillow==9.0.1
|
||||||
|
pyparsing==3.0.7
|
||||||
|
python-dateutil==2.8.2
|
||||||
|
pytz==2021.3
|
||||||
|
requests==2.27.1
|
||||||
|
scipy==1.8.0
|
||||||
|
seaborn==0.11.2
|
||||||
|
six==1.16.0
|
||||||
|
soupsieve==2.3.1
|
||||||
|
urllib3==1.26.8
|
||||||
|
word2number==1.1
|
||||||
|
yapf==0.32.0
|
||||||
|
|||||||