task_180222

This commit is contained in:
Krisotfers-Solo 2022-02-21 01:37:28 +02:00
parent 8f9c22e1d6
commit 2489e585e5
17 changed files with 181 additions and 83 deletions

3
.gitignore vendored
View File

@ -3,4 +3,5 @@
**/.venv/ **/.venv/
**.log **.log
/december/task_081221/files /december/task_081221/files
/december/task_081221/*.log /december/task_081221/*.log
**.plk

Binary file not shown.

Binary file not shown.

Binary file not shown.

Before

Width:  |  Height:  |  Size: 341 KiB

After

Width:  |  Height:  |  Size: 258 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 17 KiB

After

Width:  |  Height:  |  Size: 14 KiB

Binary file not shown.

View File

@ -2,10 +2,16 @@
# Date - 17.02.2022. # Date - 17.02.2022.
# Title - Patstāvīgais darbs - pandas # Title - Patstāvīgais darbs - pandas
from pathlib import Path
import pandas as pd import pandas as pd
import seaborn as sns import seaborn as sns
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
import requests
from pathlib import Path
from random import randint
from fpdf import FPDF
from statistics import mode
from PIL import Image
from io import BytesIO
from ss_scraper import SS from ss_scraper import SS
# flats_few = SS("https://www.ss.com/lv/real-estate/flats/riga-region/all/sell/") # flats_few = SS("https://www.ss.com/lv/real-estate/flats/riga-region/all/sell/")
@ -13,105 +19,202 @@ from ss_scraper import SS
# Directory the generated graph images are written to.
output_path = "output/graphs"
# Module-level accumulator of the DataFrames loaded from the .xlsx workbooks.
all_df = []

# Column headers of the scraped spreadsheets (Latvian, as written by ss_scraper).
QUADRATURE = "Kvadratūra"
FLOOR = "Stāvs"
PRICE = "Cena"
SERIES = "Sērija"
ROOM_AMOUNT = "Istabu skaits"
PUB_DATE = "Izvietošanas datums"

# Thumbnail URL per building series, keyed by the label found in the SERIES
# column; create_pdf() uses it to illustrate the most common series.
series_photos = {
    "103.": "https://i.ss.com/gallery/5/902/225301/45060087.th2.jpg",
    "104.": "https://i.ss.com/gallery/5/888/221910/44381841.th2.jpg",
    "119.": "https://i.ss.com/gallery/5/902/225443/45088567.th2.jpg",
    "467.": "https://i.ss.com/gallery/5/892/222881/44576186.th2.jpg",
    "602.": "https://i.ss.com/gallery/5/896/223820/44763891.th2.jpg",
    "Čehu pr.": "https://i.ss.com/gallery/5/902/225358/45071499.th2.jpg",
    "Hrušč.": "https://i.ss.com/gallery/5/896/223961/44792152.th2.jpg",
    "LT proj.": "https://i.ss.com/gallery/5/873/218203/43640498.th2.jpg",
    "M. ģim.": "https://i.ss.com/gallery/5/871/217506/43501012.th2.jpg",
    "P. kara": "https://i.ss.com/gallery/5/902/225490/45097851.th2.jpg",
    "Priv. m.": "https://i.ss.com/gallery/5/895/223697/44739240.th2.jpg",
    "Renov.": "https://i.ss.com/gallery/5/902/225442/45088303.th2.jpg",
    "Specpr.": "https://i.ss.com/gallery/5/902/225492/45098378.th2.jpg",
    "Staļina": "https://i.ss.com/gallery/5/902/225440/45087952.th2.jpg",
    "Jaun.": "https://i.ss.com/gallery/5/902/225456/45091154.th2.jpg",
}
class priceGraphs:
    """One scatter panel of the 3x2 price-graph grid.

    Args:
        data: Table indexable by column name; ``data[x_value]`` and
            ``data[y_value]`` are read once, at construction time.
        pos: ``(row, column)`` cell of the 3x2 grid the panel is drawn in.
        x_value: Name of the column plotted on the x axis.
        title: Panel title.
        xlabel: Label of the x axis.
        y_value: Name of the column plotted on the y axis (the price column
            by default).
        ylabel: Label of the y axis.
    """

    def __init__(self, data, pos, x_value, title, xlabel, y_value=PRICE, ylabel="Price"):
        self.pos = pos
        self.x_value = data[x_value]
        self.y_value = data[y_value]
        self.title = title
        self.xlabel = xlabel
        self.ylabel = ylabel

    def _graph_price(self):
        """Draw the scatter plot into this panel's cell of the current figure."""
        axes = plt.subplot2grid((3, 2), self.pos)
        axes.scatter(self.x_value, self.y_value)
        axes.set_title(self.title)
        axes.set_xlabel(self.xlabel)
        axes.set_ylabel(self.ylabel)
def read():
    """Load every ``.xlsx`` workbook below this module's directory and clean it.

    The workbooks are concatenated, adverts whose full text occurs more than
    once are dropped, and the floor, price, room-count and publication-date
    columns are converted from text to integers / datetimes.

    Returns:
        The combined DataFrame sorted by price, then publication date.

    Raises:
        ValueError: If no ``.xlsx`` file is found.
    """
    def to_int_or_keep(value):
        # Best effort: a room count that is not numeric is left untouched
        # instead of aborting the conversion of all remaining values.
        try:
            return int(value)
        except (TypeError, ValueError):
            return value

    base_dir = Path(__file__).parent.absolute()
    frames = [pd.read_excel(file_path) for file_path in sorted(base_dir.glob("**/*.xlsx"))]
    if not frames:
        raise ValueError(f"No .xlsx files found under {base_dir}")
    # The module-level accumulator is still fed, but only this call's frames
    # are combined, so calling read() twice no longer duplicates every row.
    all_df.extend(frames)

    df_combined = pd.concat(frames).reset_index(drop=True)
    df_combined.sort_values(by=[PRICE, PUB_DATE], inplace=True)
    # NOTE(review): keep=False discards *every* copy of a repeated advert, not
    # just the extra ones - confirm this is intended (keep="first" keeps one).
    df_combined.drop_duplicates(subset="Pilns sludinājuma teksts", keep=False, inplace=True)

    # Floor is stored as "<floor>/<total floors>"; keep the part before the slash.
    df_combined[FLOOR] = df_combined[FLOOR].map(
        lambda value: int(float(str(value).partition("/")[0]))
    )
    # Price text -> integer (text before the first space, thousands commas removed).
    df_combined[PRICE] = df_combined[PRICE].map(
        lambda value: replace_value(value, " ", ",", "")
    )
    # "citi"/"Citi" room counts are treated as two rooms.
    rooms = df_combined[ROOM_AMOUNT].replace(["citi", "Citi"], "2")
    df_combined[ROOM_AMOUNT] = rooms.map(to_int_or_keep)

    # Publication date text (dd.mm.yyyy) -> datetime.
    df_combined[PUB_DATE] = pd.to_datetime(df_combined[PUB_DATE], format="%d.%m.%Y")
    return df_combined.sort_values(by=[PRICE, PUB_DATE])
def replace_value(value, find, replace, replace_to):
    """Parse the integer at the start of *value*.

    Args:
        value: Text such as ``"45,000 €"``.
        find: Delimiter; everything from its first occurrence on is ignored.
            When it does not occur, the whole of *value* is used.
        replace: Substring to substitute in the kept part (e.g. the
            thousands separator).
        replace_to: Replacement for *replace*.

    Returns:
        The kept part, after substitution, converted with ``int``.

    Raises:
        ValueError: If the remaining text is not a valid integer.
    """
    # partition() keeps the full text when the delimiter is missing, whereas
    # slicing with str.find()'s -1 would silently drop the last character.
    head, _, _ = value.partition(find)
    return int(head.replace(replace, replace_to))
def graph_plot():
data = get_data()
graph_corr(data)
graph_price(data)
def graph_corr(data):
    """Save a correlation heatmap of the numeric columns as ``korelacija.png``.

    Args:
        data: DataFrame holding the SERIES column; it is not modified, the
            work is done on a copy.
    """
    data_corr = data.copy()
    plt.rc("font", size=8)

    # Encode each series label as an integer code, numbered in order of first
    # appearance, so the series takes part in the correlation matrix.
    data_corr[SERIES] = pd.factorize(data_corr[SERIES])[0]

    Path(output_path).mkdir(parents=True, exist_ok=True)
    figure = plt.figure()
    # Only numeric columns can be correlated; text/datetime columns are left
    # out explicitly because newer pandas raises on them instead of skipping.
    sns.heatmap(data_corr.select_dtypes(include="number").corr())
    plt.savefig(f"{output_path}/korelacija.png")
    plt.close(figure)
def graph_price(data):
    """Save the five price scatter plots in one figure as ``cenu_grafiki.png``.

    Args:
        data: DataFrame with the price column and the FLOOR, ROOM_AMOUNT,
            QUADRATURE, SERIES and PUB_DATE columns.
    """
    # (grid cell, x column, title, x label) for every panel of the 3x2 grid.
    panels = (
        ((0, 0), FLOOR, "Price to floor", "Floor"),
        ((0, 1), ROOM_AMOUNT, "Price to room amount", "Room amount"),
        ((1, 0), QUADRATURE, "Price to quadrature", "Quadrature"),
        ((1, 1), SERIES, "Price to series", "Series"),
        ((2, 0), PUB_DATE, "Price to date", "Date"),
    )

    figure = plt.figure(figsize=(50, 30))
    plt.rc("font", size=15)
    for pos, column, title, xlabel in panels:
        priceGraphs(data, pos, column, title, xlabel)._graph_price()

    Path(output_path).mkdir(parents=True, exist_ok=True)
    plt.savefig(f"{output_path}/cenu_grafiki.png")
    # Release the (large) figure once it has been written to disk.
    plt.close(figure)
def create_pdf(data):
    """Write the report to ``output/pdf.pdf``.

    The report contains a table of five randomly chosen rows, the two graph
    images saved under ``output_path``, a short commentary, the values
    returned by ``calc_average`` and, when one is known, a photo of the most
    common building series.

    Args:
        data: Cleaned DataFrame with the PRICE, FLOOR, ROOM_AMOUNT, SERIES,
            QUADRATURE and PUB_DATE columns.
    """
    pdf = FPDF("P", "mm", "A4")
    pdf.add_page()
    pdf.add_font("Roboto", fname="fonts/Roboto-Regular.ttf", uni=True)
    pdf.set_font("Roboto", size=12)

    usable_w = pdf.w - 2 * pdf.l_margin
    width = usable_w / 7
    height = pdf.font_size * 2
    line_height = 5
    columns = [PRICE, FLOOR, ROOM_AMOUNT, SERIES, QUADRATURE, PUB_DATE]

    def column_width(column):
        # The date column gets a double-width cell; six columns share 7 units.
        return width * 2 if column == PUB_DATE else width

    # Table header.
    for column in columns:
        pdf.cell(column_width(column), height, column, border=1)
    pdf.ln(height)

    # Five random rows, picked by position: the index labels have gaps after
    # de-duplication and sorting, so label-based lookup could raise KeyError.
    row_count = len(data)
    for _ in range(5 if row_count else 0):
        position = randint(0, row_count - 1)
        for column in columns:
            pdf.cell(column_width(column), height, str(data[column].iloc[position]), border=1)
        pdf.ln(height)

    # Each line is stripped before it is written, so the indentation inside
    # the literal does not reach the PDF.
    text = """
    "Price to floor" grafiks - lielākā daļa pārdodamo dzīvokļu ir līdz 5. stāvam.
    "Price to room amount" grafiks - jo mazāk istabu, jo lētāks dzīvoklis.
    "Price to quadrature" grafiks - jo lielāka dzīvokļa platība, jo dārgāks dzīvoklis.
    "Price to series" grafiks - dārgākie dzīvokļi ir jaunie.
    "Price to date" grafiks - nesen pārdošanā ielikto dzīvokļu ir vairāk.
    """
    pdf.ln(height)
    pdf.image(f"{output_path}/korelacija.png", w=usable_w)
    pdf.image(f"{output_path}/cenu_grafiki.png", w=usable_w)
    for txt in text.split("\n"):
        pdf.write(line_height, txt.strip())
        pdf.ln(line_height)

    average = calc_average(data)
    for key, value in average.items():
        if not isinstance(value, str):
            value = str(round(value))
        pdf.write(line_height, f"{key} - {value}")
        pdf.ln(line_height)

    # The photo is optional: a series without a known thumbnail is skipped.
    photo_url = series_photos.get(average[SERIES])
    if photo_url is not None:
        response = requests.get(photo_url, timeout=10)
        # Fail on an HTTP error here rather than handing an error page to PIL.
        response.raise_for_status()
        pdf.image(Image.open(BytesIO(response.content)))

    Path("output").mkdir(parents=True, exist_ok=True)
    pdf.output("output/pdf.pdf")
def calc_average(data):
    """Relate the most common price to the most common value of each column.

    Args:
        data: Table indexable by column name, holding the PRICE, FLOOR,
            ROOM_AMOUNT, SERIES and QUADRATURE columns.

    Returns:
        A dict keyed by FLOOR, ROOM_AMOUNT, SERIES and QUADRATURE (in that
        order). The SERIES entry is the most common series label; every other
        entry is ``mode(price) / mode(column)``.

    Raises:
        statistics.StatisticsError: If a column is empty.
    """
    # The modal price is the same for every column, so compute it once.
    typical_price = mode(data[PRICE])

    mean_price_columns = {}
    for column in (FLOOR, ROOM_AMOUNT, SERIES, QUADRATURE):
        typical_value = mode(data[column])
        # A series label cannot be divided into a price; report it as is.
        mean_price_columns[column] = (
            typical_value if column == SERIES else typical_price / typical_value
        )
    return mean_price_columns
def graph_plot():
    """Run the pipeline: load the data, save both graphs, then build the PDF."""
    data = read()
    # Order matters: create_pdf embeds the images the two graph steps save.
    for step in (graph_corr, graph_price, create_pdf):
        step(data)
def main():
    """Script entry point: produce the graphs and the PDF report."""
    graph_plot()

View File

@ -14,6 +14,7 @@ HEADERS = {
class SS: class SS:
def __init__(self, url, name): def __init__(self, url, name):
self.url = url self.url = url
self.name = name self.name = name
@ -77,24 +78,20 @@ class SS:
chunked_items_list = [items[i:i + chunk_size] for i in range(0, len(items), chunk_size)] # combines each 'chunk_size' elements into array chunked_items_list = [items[i:i + chunk_size] for i in range(0, len(items), chunk_size)] # combines each 'chunk_size' elements into array
columns = ["Atrašanās vieta", "Istabu skaits", "Kvadratūra", "Stāvs", "Sērija", "Cena", "Pilns sludinājuma teksts", "Izvietošanas datums"] columns = ["Atrašanās vieta", "Istabu skaits", "Kvadratūra", "Stāvs", "Sērija", "Cena", "Pilns sludinājuma teksts", "Izvietošanas datums"]
df = pd.DataFrame(chunked_items_list, columns=columns) df = pd.DataFrame(chunked_items_list, columns=columns)
time = datetime.now().strftime("%d%m%Y%H%M%S") time = datetime.now().strftime("%d%m%y%H%M%S") # current time
df.to_excel(excel_writer=f"output/excel/output_{self.name}_{time}.xlsx", index=False) df.to_excel(excel_writer=f"output/excel/ss_{self.name}_{time}.xlsx", index=False)
print("Done") print("Done")
flats_many = SS("https://www.ss.com/lv/real-estate/flats/riga/all/sell/", "many") flats_riga = SS("https://www.ss.com/lv/real-estate/flats/riga/all/sell/", "riga")
flats_few = SS("https://www.ss.com/lv/real-estate/flats/riga-region/all/sell/", "few") flats_rigareg = SS("https://www.ss.com/lv/real-estate/flats/riga-region/all/sell/", "rigareg")
flats_aizkraukle = SS("https://www.ss.com/lv/real-estate/flats/aizkraukle-and-reg/sell/", "aizkraukle") flats_aizkraukle = SS("https://www.ss.com/lv/real-estate/flats/aizkraukle-and-reg/sell/", "aizkraukle")
flats_tukums = SS("https://www.ss.com/lv/real-estate/flats/tukums-and-reg/sell/", "tukums") flats_tukums = SS("https://www.ss.com/lv/real-estate/flats/tukums-and-reg/sell/", "tukums")
flats_ogre = SS("https://www.ss.com/lv/real-estate/flats/ogre-and-reg/sell/", "ogre") flats_ogre = SS("https://www.ss.com/lv/real-estate/flats/ogre-and-reg/sell/", "ogre")
def main(): def main():
# flats_aizkraukle.get_data() flats_riga.get_data()
# flats_tukums.get_data()
# flats_ogre.get_data()
# flats_few.get_data()
flats_many.get_data()
if __name__ == '__main__': if __name__ == '__main__':

View File

@ -1,5 +1,2 @@
# Scratch check: a numeric string with a decimal part has to go through
# float() first, because int("5.00") raises ValueError.
val = "5.00"
print(int(float(val)))