School/february/task_180222/pd_pandas_k_f_cagulis.py
Kristofers-Solo b3fc9f2cc3 task_180222
2022-02-21 16:34:52 +02:00

235 lines
7.3 KiB
Python

# Author - Kristiāns Francis Cagulis
# Date - 17.02.2022.
# Title - Patstāvīgais darbs - pandas
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import requests
import sys
from pathlib import Path
from random import randint
from fpdf import FPDF
from statistics import mode
from PIL import Image
from io import BytesIO
from ss_scraper import SS
# Directory the graph images are written to (used by the savefig calls below).
output_path = "output/graphs"
# Module-level list that read() appends every loaded spreadsheet to.
all_df = []
# Spreadsheet column headers (Latvian) used to index the DataFrame.
QUADRATURE = "Kvadratūra"
FLOOR = "Stāvs"
PRICE = "Cena"
SERIES = "Sērija"
ROOM_AMOUNT = "Istabu skaits"
PUB_DATE = "Izvietošanas datums"
# Columns shown in the PDF sample table and summarised by calc_mode().
COLUMNS = [PRICE, FLOOR, ROOM_AMOUNT, SERIES, QUADRATURE, PUB_DATE]
# Thumbnail URL for each building-series label.
# NOTE(review): not referenced by any active code in the visible part of this
# file — presumably intended for a series photo in the PDF; confirm.
series_photos = {
"103.": "https://i.ss.com/gallery/5/902/225301/45060087.th2.jpg",
"104.": "https://i.ss.com/gallery/5/888/221910/44381841.th2.jpg",
"119.": "https://i.ss.com/gallery/5/902/225443/45088567.th2.jpg",
"467.": "https://i.ss.com/gallery/5/892/222881/44576186.th2.jpg",
"602.": "https://i.ss.com/gallery/5/896/223820/44763891.th2.jpg",
"Čehu pr.": "https://i.ss.com/gallery/5/902/225358/45071499.th2.jpg",
"Hrušč.": "https://i.ss.com/gallery/5/896/223961/44792152.th2.jpg",
"LT proj.": "https://i.ss.com/gallery/5/873/218203/43640498.th2.jpg",
"M. ģim.": "https://i.ss.com/gallery/5/871/217506/43501012.th2.jpg",
"P. kara": "https://i.ss.com/gallery/5/902/225490/45097851.th2.jpg",
"Priv. m.": "https://i.ss.com/gallery/5/895/223697/44739240.th2.jpg",
"Renov.": "https://i.ss.com/gallery/5/902/225442/45088303.th2.jpg",
"Specpr.": "https://i.ss.com/gallery/5/902/225492/45098378.th2.jpg",
"Staļina": "https://i.ss.com/gallery/5/902/225440/45087952.th2.jpg",
"Jaun.": "https://i.ss.com/gallery/5/902/225456/45091154.th2.jpg"
}
class priceGraphs:
    """A single scatter plot of one data column against another in a 3x2 grid."""

    def __init__(self, data, pos, x_value, title, xlabel, y_value=PRICE, ylabel="Price"):
        """Store the grid cell, the two data columns and the plot captions.

        Args:
            data: mapping-like table indexed by column name.
            pos: (row, column) cell of the 3x2 subplot grid.
            x_value: name of the column plotted on the x axis.
            title: subplot title.
            xlabel: x-axis label.
            y_value: name of the column plotted on the y axis.
            ylabel: y-axis label.
        """
        self.pos = pos
        # Look the columns up once so that only their values are kept.
        self.x_value, self.y_value = data[x_value], data[y_value]
        self.title, self.xlabel, self.ylabel = title, xlabel, ylabel

    def _graph_price(self):
        """Draw the scatter plot into this instance's cell of the grid."""
        axes = plt.subplot2grid((3, 2), self.pos)
        axes.scatter(self.x_value, self.y_value)
        axes.set_title(self.title)
        axes.set_xlabel(self.xlabel)
        axes.set_ylabel(self.ylabel)
def _floor_to_int(value):
    """Return the floor number from a "floor/total" cell such as "3/5"."""
    # partition() keeps the whole text when there is no "/", whereas slicing
    # with str.find() would silently cut off the last character.
    return int(float(str(value).partition("/")[0]))


def _price_to_int(value):
    """Return a price cell as an integer; text prices go through replace_value()."""
    if isinstance(value, str):
        return replace_value(value, " ", ",", "")
    return int(value)


def read():
    """Load every .xlsx file below this script's directory into one cleaned table.

    The floor, price and room-amount columns are converted to numbers and the
    publication date to datetime. Conversions are applied to their own column
    only, so equal values in unrelated columns are never rewritten.

    Returns:
        pandas.DataFrame sorted by price and publication date.

    Raises:
        ValueError: if no .xlsx file is found.
    """
    base_dir = Path(__file__).parent.absolute()
    frames = [pd.read_excel(file_path) for file_path in base_dir.glob("**/*.xlsx")]
    if not frames:
        raise ValueError(f"No .xlsx files found under {base_dir}")

    # Keep the module-level list in sync with this load instead of appending
    # to it: appending made a second call see every spreadsheet twice.
    all_df[:] = frames

    df_combined = pd.concat(frames).reset_index(drop=True)
    df_combined = df_combined.sort_values(by=[PRICE, PUB_DATE])
    # keep="first" leaves one copy of an advertisement that occurs in several
    # files; keep=False would remove every copy of it from the data.
    df_combined = df_combined.drop_duplicates(subset="Pilns sludinājuma teksts", keep="first")

    # floor "3/5" -> 3
    df_combined[FLOOR] = df_combined[FLOOR].map(_floor_to_int)

    # price text -> integer
    df_combined[PRICE] = df_combined[PRICE].map(_price_to_int)

    # "citi"/"Citi" is counted as 7 rooms; cells that still cannot be parsed
    # become NaN instead of silently aborting the whole conversion.
    rooms = df_combined[ROOM_AMOUNT].replace(["citi", "Citi"], "7")
    df_combined[ROOM_AMOUNT] = pd.to_numeric(rooms, errors="coerce")

    # converts to datetime
    df_combined[PUB_DATE] = pd.to_datetime(df_combined[PUB_DATE], format="%d.%m.%Y")

    return df_combined.sort_values(by=[PRICE, PUB_DATE])
def replace_value(value: str, find: str, replace: str, replace_to: str) -> int:
    """Return the integer written in the part of *value* before the first *find*.

    Args:
        value: text to parse, e.g. "45,000 €".
        find: delimiter that ends the numeric part; when it does not occur,
            the whole text is parsed.
        replace: substring to substitute inside the numeric part.
        replace_to: replacement for *replace*.

    Raises:
        ValueError: if the remaining text is not an integer, or *find* is empty.
    """
    # partition() returns the complete text when the delimiter is missing;
    # value[:value.find(find)] would drop the last character in that case.
    head = value.partition(find)[0]
    return int(head.replace(replace, replace_to))
def graph_corr(data):
    """Save a heatmap of the column correlations as korelacija.png in output_path.

    The series labels are encoded as integers first so that the SERIES column
    takes part in the correlation.

    Args:
        data: DataFrame containing at least the SERIES column; it is not modified.
    """
    data_corr = data.copy()
    plt.rc("font", size=8)

    # Number the distinct series labels in order of first appearance
    # (missing labels are encoded as -1).
    data_corr[SERIES] = pd.factorize(data_corr[SERIES])[0]

    # Correlate numeric/boolean columns only: text and datetime columns make
    # DataFrame.corr() raise on pandas 2.x.
    numeric = data_corr.select_dtypes(include=["number", "bool"])

    # Use a dedicated figure so the heatmap is never drawn over a figure left
    # open by an earlier plot, and release it once the image is written.
    plt.figure()
    sns.heatmap(numeric.corr())
    plt.savefig(f"{output_path}/korelacija.png")
    plt.close()
def graph_price(data):
    """Draw five price scatter plots on one figure and save it as cenu_grafiki.png."""
    plt.figure(figsize=(50, 30))
    plt.rc("font", size=15)

    # (grid cell, x-axis column, title, x-axis label) of every subplot
    layout = (
        ((0, 0), FLOOR, "Price to floor", "Floor"),
        ((0, 1), ROOM_AMOUNT, "Price to room amount", "Room amount"),
        ((1, 0), QUADRATURE, "Price to quadrature", "Quadrature"),
        ((1, 1), SERIES, "Price to series", "Series"),
        ((2, 0), PUB_DATE, "Price to date", "Date"),
    )

    # All plot objects are created before the first one is drawn.
    graphs = [
        priceGraphs(data, cell, column, title, xlabel)
        for cell, column, title, xlabel in layout
    ]
    for graph in graphs:
        graph._graph_price()

    plt.savefig(f"{output_path}/cenu_grafiki.png")
def create_pdf(data):
    """Write output/pdf.pdf: a sample table, both graphs, commentary and modes.

    The table shows five randomly chosen rows (repeats are possible). The
    graph images are read from output_path, so graph_corr() and graph_price()
    have to be run first.

    Args:
        data: DataFrame containing every column listed in COLUMNS.
    """
    pdf = FPDF("P", "mm", "A4")
    pdf.add_page()
    pdf.add_font("Roboto", fname="fonts/Roboto-Regular.ttf", uni=True)
    pdf.set_font("Roboto", size=12)

    usable_w = pdf.w - 2 * pdf.l_margin
    width = usable_w / 7
    height = pdf.font_size * 2
    line_height = 5

    # Six columns share seven width units: the date column is twice as wide.
    col_widths = {column: width * 2 if column == PUB_DATE else width for column in COLUMNS}

    # header row
    for column in COLUMNS:
        pdf.cell(col_widths[column], height, column, border=1)
    pdf.ln(height)

    # randint() includes both bounds, so the valid positions for iloc are
    # 0 .. len(data) - 1; an upper bound of len(data) indexes past the end.
    row_count = len(data)
    sample_rows = 5 if row_count else 0
    for _ in range(sample_rows):
        rand_num = randint(0, row_count - 1)
        for column in COLUMNS:
            pdf.cell(col_widths[column], height, str(data[column].iloc[rand_num]), border=1)
        pdf.ln(height)

    # Every line is stripped before it is written, so the indentation inside
    # this literal does not reach the PDF.
    text = """
    "Price to floor" grafiks - lielākā daļa pārdodamo dzīvokļu ir līdz 5. stāvam.
    "Price to room amount" grafiks - jo mazāk istabu, jo lētāks dzīvoklis.
    "Price to quadrature" grafiks - jo lielāka dzīvokļa platība, jo dārgāks dzīvoklis.
    "Price to series" grafiks - dārgākie dzīvokļi ir jaunie.
    "Price to date" grafiks - nesen pārdošanā ielikto dzīvokļu ir vairāk.
    """

    pdf.ln(height)
    pdf.image(f"{output_path}/korelacija.png", w=usable_w)
    pdf.image(f"{output_path}/cenu_grafiki.png", w=usable_w)

    for txt in text.split("\n"):
        pdf.write(line_height, txt.strip())
        pdf.ln(line_height)

    # most frequent value of every column
    average = calc_mode(data)
    for key, value in average.items():
        print(f"{key} - {value}")
        pdf.write(line_height, f"{key} - {value}")
        pdf.ln(line_height)

    # TODO: add the photo of the most common series (series_photos) to the PDF.
    pdf.output("output/pdf.pdf")
def calc_mode(data):
    """Return a dict mapping every name in COLUMNS to the mode of that column of *data*."""
    return {column: mode(data[column]) for column in COLUMNS}
def graph_plot():
    """Load the data, then draw the correlation graph, the price graphs and the PDF."""
    flats = read()
    # The PDF embeds both graph images, so it is produced last.
    for step in (graph_corr, graph_price, create_pdf):
        step(flats)
# One scraper object per region, each built from the ss.com "flats for sale"
# listing URL and a short region name. Only flats_riga is used by active code
# in main().
# NOTE(review): the second argument presumably names the scraper's output —
# confirm in ss_scraper.
flats_riga = SS("https://www.ss.com/lv/real-estate/flats/riga/all/sell/", "riga")
flats_rigareg = SS("https://www.ss.com/lv/real-estate/flats/riga-region/all/sell/", "rigareg")
flats_aizkraukle = SS("https://www.ss.com/lv/real-estate/flats/aizkraukle-and-reg/sell/", "aizkraukle")
flats_tukums = SS("https://www.ss.com/lv/real-estate/flats/tukums-and-reg/sell/", "tukums")
flats_ogre = SS("https://www.ss.com/lv/real-estate/flats/ogre-and-reg/sell/", "ogre")
def main(argv):
    """Handle the command-line flags, then build the graphs and the PDF.

    Flags:
        -h, --help: print a usage line and exit without producing output.
        -n, --new: scrape the Riga listings again before the report is built.

    Args:
        argv: command-line arguments without the program name.
    """
    for arg in argv:
        if arg in ("-h", "--help"):
            # The accepted short flag is the lower-case "-n".
            print(f"{__file__} -n --new Scrape new file")
            # sys.exit() rather than exit(): the latter is injected by the
            # site module for interactive use and is not always available.
            sys.exit()
        elif arg in ("-n", "--new"):
            flats_riga.get_data()
    graph_plot()
# Run the report only when this file is executed as a script; main() receives
# the command-line arguments without the program name.
if __name__ == "__main__":
    main(sys.argv[1:])