# School/february/task_180222/pd_pandas_k_f_cagulis.py
# 2022-08-02 20:34:11 +03:00
#
# 276 lines
# 8.7 KiB
# Python

# Author - Kristiāns Francis Cagulis
# Date - 21.02.2022.
# Title - Patstāvīgais darbs - pandas
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import requests
import sys
from os import mkdir, listdir
from pathlib import Path
from random import randint
from fpdf import FPDF
from statistics import mode, mean
from PIL import Image
from io import BytesIO
from ss_scraper import SS
# Directory that receives the rendered graph images.
output_path = "output/graphs"

# Module-level accumulator: read() appends every loaded spreadsheet to it.
all_df = []

# Column headers used to index the loaded spreadsheets.
PRICE = "Cena"
FLOOR = "Stāvs"
ROOM_AMOUNT = "Istabu skaits"
SERIES = "Sērija"
QUADRATURE = "Kvadratūra"
PUB_DATE = "Izvietošanas datums"

# Column order of the sample table printed into the PDF.
COLUMNS = [
    PRICE,
    FLOOR,
    ROOM_AMOUNT,
    SERIES,
    QUADRATURE,
    PUB_DATE,
]

# Photo URL for each building series, keyed by the series label.
series_photos = {
    "103.": "https://i.ss.com/gallery/5/902/225301/45060087.th2.jpg",
    "104.": "https://i.ss.com/gallery/5/888/221910/44381841.th2.jpg",
    "119.": "https://i.ss.com/gallery/5/902/225443/45088567.th2.jpg",
    "467.": "https://i.ss.com/gallery/5/892/222881/44576186.th2.jpg",
    "602.": "https://i.ss.com/gallery/5/896/223820/44763891.th2.jpg",
    "Čehu pr.": "https://i.ss.com/gallery/5/902/225358/45071499.th2.jpg",
    "Hrušč.": "https://i.ss.com/gallery/5/896/223961/44792152.th2.jpg",
    "LT proj.": "https://i.ss.com/gallery/5/873/218203/43640498.th2.jpg",
    "M. ģim.": "https://i.ss.com/gallery/5/871/217506/43501012.th2.jpg",
    "P. kara": "https://i.ss.com/gallery/5/902/225490/45097851.th2.jpg",
    "Priv. m.": "https://i.ss.com/gallery/5/895/223697/44739240.th2.jpg",
    "Renov.": "https://i.ss.com/gallery/5/902/225442/45088303.th2.jpg",
    "Specpr.": "https://i.ss.com/gallery/5/902/225492/45098378.th2.jpg",
    "Staļina": "https://i.ss.com/gallery/5/902/225440/45087952.th2.jpg",
    "Jaun.": "https://i.ss.com/gallery/5/902/225456/45091154.th2.jpg",
}
class priceGraph:
    """One scatter plot of a data column against another, placed in a 3x2 grid.

    The constructor only captures the two columns and the labels; nothing is
    drawn until ``_graph_price`` is called.
    """

    def __init__(self, data, pos, title, x_value, xlabel, xticks=None, y_value=PRICE, ylabel="Price"):
        """Store the plot configuration.

        Args:
            data: Mapping/DataFrame indexable by column name.
            pos: ``(row, column)`` cell of the 3x2 subplot grid.
            title: Title shown above the subplot.
            x_value: Name of the column plotted on the x axis.
            xlabel: Label of the x axis.
            xticks: Optional explicit x tick positions; ``None`` keeps the
                ticks matplotlib picks on its own.
            y_value: Name of the column plotted on the y axis.
            ylabel: Label of the y axis.
        """
        self.pos = pos
        self.x_value = data[x_value]
        self.y_value = data[y_value]
        self.title = title
        self.xlabel = xlabel
        self.ylabel = ylabel
        self.xticks = xticks

    def _graph_price(self):
        """Draw the scatter plot into its grid cell of the current figure."""
        plot = plt.subplot2grid((3, 2), self.pos)
        plot.scatter(self.x_value, self.y_value)
        plot.set_title(self.title)
        plot.set_xlabel(self.xlabel)
        plot.set_ylabel(self.ylabel)
        # Identity check on purpose: `!= None` is evaluated element-wise by
        # array-like tick collections and cannot serve as a condition.
        if self.xticks is not None:
            plot.set_xticks(self.xticks)
def _parse_floor(value):
    """Return the part of a floor cell before the first "/" as an int."""
    # partition() keeps the whole text when "/" is missing (str.find() would
    # return -1 and the slice would drop the last character); str() lets
    # cells that are already numeric pass through.
    return int(float(str(value).partition("/")[0]))


def _parse_price(value):
    """Return the part of a price cell before the first space as an int.

    Commas inside that part are removed before the conversion.
    """
    return int(str(value).partition(" ")[0].replace(",", ""))


def _parse_room_amount(value):
    """Return the room count as an int; "citi"/"Citi" is encoded as 7."""
    if value in ("citi", "Citi"):
        return 7
    return int(value)


def read():
    """Load every spreadsheet next to this script and return the cleaned data.

    All ``*.xlsx`` files found recursively below the script's directory are
    read and appended to the module-level ``all_df`` list, concatenated,
    de-duplicated and normalised:

    * floor      -> int (text before the first "/")
    * price      -> int (text before the first space, commas removed)
    * room count -> int ("citi"/"Citi" becomes 7)
    * date       -> ``datetime.date`` parsed as ``%d.%m.%Y``

    The cleaned frame is written to ``output/excel/combined.xlsx`` (the
    directory must already exist).

    Returns:
        The cleaned DataFrame sorted by publication date.

    Raises:
        ValueError: Raised by ``pd.concat`` when no spreadsheet has been
            loaded, or by the conversions when a cell is not parseable.
    """
    combined_file = "output/excel/combined.xlsx"
    combined_resolved = Path(combined_file).resolve()

    # The recursive glob may reach this function's own output file; its cells
    # are already converted, so reading it back must be avoided.
    source_files = [
        file_path
        for file_path in Path(__file__).parent.absolute().glob("**/*.xlsx")
        if file_path.resolve() != combined_resolved
    ]
    for file_path in source_files:
        all_df.append(pd.read_excel(file_path))

    df_combined = pd.concat(all_df).reset_index(drop=True)  # combine DataFrames
    df_combined.sort_values(by=[PRICE, PUB_DATE], inplace=True)  # sort DataFrame
    df_combined.drop_duplicates(keep="first", inplace=True)  # drop duplicates

    # Convert each column on its own. A DataFrame-wide replace() per row is
    # quadratic and also rewrites matching cells in unrelated columns.
    df_combined[FLOOR] = df_combined[FLOOR].map(_parse_floor)
    df_combined[PRICE] = df_combined[PRICE].map(_parse_price)
    df_combined[ROOM_AMOUNT] = df_combined[ROOM_AMOUNT].map(_parse_room_amount)

    # converts to datetime
    df_combined[PUB_DATE] = pd.to_datetime(
        df_combined[PUB_DATE], format="%d.%m.%Y").dt.date

    df_combined.to_excel(combined_file, index=False)
    return df_combined.sort_values(by=PUB_DATE)
def replace_value(value, find, replace, replace_to):
    """Convert the leading part of a text value to an int.

    Args:
        value: Text to convert.
        find: Separator; only the text before its first occurrence is kept.
            When the separator does not occur, the whole text is used.
        replace: Substring to substitute inside the kept part.
        replace_to: Replacement for ``replace``.

    Returns:
        The kept part, after substitution, as an int.

    Raises:
        ValueError: If the resulting text is not a valid integer, or if
            ``find`` is an empty string.
    """
    # partition() instead of slicing at str.find(): find() returns -1 for a
    # missing separator, which would silently cut off the last character.
    head = value.partition(find)[0]
    return int(head.replace(replace, replace_to))
def graph_corr(data):
    """Save a correlation heatmap of the numeric columns.

    The series labels are first encoded as integers so that the series
    column takes part in the correlation. The image is written to
    ``{output_path}/korelacija.png``.

    Args:
        data: DataFrame holding at least the series column; it is not
            modified (a copy is encoded).
    """
    data_corr = data.copy()
    plt.rc("font", size=8)

    # Number the series labels in order of first appearance (missing values
    # get -1). factorize() yields an integer column directly.
    data_corr[SERIES] = pd.factorize(data_corr[SERIES])[0]

    # Dedicated figure so repeated calls do not stack heatmaps on whatever
    # figure happens to be current; closed again once the file is written.
    fig = plt.figure()
    try:
        # Restrict to numeric columns explicitly: corr() no longer drops
        # non-numeric columns (e.g. the date column) by itself on pandas >= 2.0.
        sns.heatmap(data_corr.select_dtypes(include="number").corr())
        plt.savefig(f"{output_path}/korelacija.png")
    finally:
        plt.close(fig)
def graph_price(data):
    """Save the five price scatter plots as one image.

    Price is plotted against floor, room amount, quadrature, series and
    publication date in a 3x2 grid; the image is written to
    ``{output_path}/cenu_grafiki.png``.

    Args:
        data: DataFrame holding the price column and the five x columns.
    """
    fig = plt.figure(figsize=(50, 30))
    plt.rc("font", size=15)

    # (grid position, title, x column, x label, explicit x ticks or None)
    panels = [
        ((0, 0), "Price to floor", FLOOR, "Floor",
         range(1, max(data[FLOOR]) + 1)),
        ((0, 1), "Price to room amount", ROOM_AMOUNT, "Room amount", None),
        ((1, 0), "Price to quadrature", QUADRATURE, "Quadrature", None),
        ((1, 1), "Price to series", SERIES, "Series", None),
        ((2, 0), "Price to date", PUB_DATE, "Date", None),
    ]

    try:
        for pos, title, column, xlabel, xticks in panels:
            priceGraph(data, pos, title, column, xlabel, xticks)._graph_price()
        plt.savefig(f"{output_path}/cenu_grafiki.png")
    finally:
        # The figure is large; release it as soon as the file is written.
        plt.close(fig)
def create_pdf(data):
    """Build the conclusions PDF and write it to ``output/pdf/secinajumi.pdf``.

    The document contains, in order: a table of five randomly picked rows,
    the correlation heatmap, the price graphs with their conclusions, a few
    mean/mode figures and a photo of the most frequent series.

    Both graph images must already exist in ``output_path`` and the font
    file ``fonts/Roboto-Regular.ttf`` must be available.

    Args:
        data: Cleaned DataFrame holding every column listed in ``COLUMNS``.

    Raises:
        KeyError: If the most frequent series has no entry in
            ``series_photos``.
        requests.RequestException: If the series photo cannot be downloaded.
    """
    pdf = FPDF("P", "mm", "A4")
    pdf.add_page()
    pdf.add_font("Roboto", fname="fonts/Roboto-Regular.ttf", uni=True)
    pdf.set_font("Roboto", size=12)

    usable_w = pdf.w - 2 * pdf.l_margin
    width = usable_w / 7
    height = pdf.font_size * 2
    LINE_HEIGHT = 5

    # Six columns share seven width units: the date column is double width.
    col_widths = {
        column: width * 2 if column == PUB_DATE else width
        for column in COLUMNS
    }

    # table head
    for column in COLUMNS:
        pdf.cell(col_widths[column], height, column, border=1)
    pdf.ln(height)

    # table contents
    for _ in range(5):
        # randint() includes both bounds, so the last valid position is
        # len(data) - 1; every row, including the first two, may be picked.
        row = randint(0, len(data) - 1)
        for column in COLUMNS:
            pdf.cell(col_widths[column], height,
                     str(data[column].iloc[row]), border=1)
        pdf.ln(height)

    pdf.ln(height)
    pdf.image(f"{output_path}/korelacija.png", w=usable_w)  # corr graph
    pdf.write(
        LINE_HEIGHT, "Starp istabu skaitu un cenu, kvadratūru un cenu ir liela korelācija.")
    pdf.ln(height)
    pdf.image(f"{output_path}/cenu_grafiki.png", w=usable_w)  # price graph

    # price graph conclusions (each line is stripped before it is written,
    # so the indentation inside the literal does not reach the PDF)
    text = """
    "Price to floor" grafiks - lielākā daļa pārdodamo dzīvokļu ir līdz 6. stāvam.
    "Price to room amount" grafiks - veido normālo sadalījumu (Gausa sadalījumu).
    "Price to quadrature" grafiks - jo lielāka dzīvokļa platība, jo dārgāks dzīvoklis.
    "Price to series" grafiks - jaunie, renovētie un pēc kara dzīvokļi ir dārgāki.
    "Price to date" grafiks - nav nekādas sakarības.
    """
    for txt in text.split("\n"):
        pdf.write(LINE_HEIGHT, txt.strip())
        pdf.ln(LINE_HEIGHT)

    # mean/mode values (computed once and reused)
    mean_price = mean(data[PRICE])
    most_common_series = mode(data[SERIES])
    text = [
        "Vidējā cena: ", "Vidējā cena attiecībā pret kvadratūru: ", "Sērijas moda: ", "Vidējā cena attiecībā pret istabu skaitu: ",
        "Vidējā cena attiecībā pret stāvu: "
    ]
    values = [
        round(mean_price, 2),
        round(mean_price / mean(data[QUADRATURE])),
        most_common_series,
        round(mean_price / mean(data[ROOM_AMOUNT])),
        round(mean_price / mean(data[FLOOR])),
    ]
    for txt, value in zip(text, values):
        pdf.write(LINE_HEIGHT, f"{txt}{value}")
        pdf.ln(LINE_HEIGHT)

    # adds photo of most frequent series
    # A timeout keeps a stalled connection from hanging the script, and the
    # status check reports an HTTP error as such instead of handing an error
    # page to the image decoder.
    response = requests.get(series_photos[most_common_series], timeout=30)
    response.raise_for_status()
    img = Image.open(BytesIO(response.content))
    pdf.image(img)

    pdf.output("output/pdf/secinajumi.pdf")
def make_dir():
    """Create the output directory tree below the current working directory.

    Ensures that ``output/excel``, ``output/graphs`` and ``output/pdf``
    exist. Directories that are already present are left untouched, so the
    function can be called repeatedly.

    Raises:
        FileExistsError: If one of the paths exists but is not a directory.
    """
    for subdir in ("excel", "graphs", "pdf"):
        # parents=True also creates "output" itself; exist_ok=True replaces
        # the separate listdir() checks and has no check-then-create race.
        Path("output", subdir).mkdir(parents=True, exist_ok=True)
def graph_plot():
    """Load the data, then render the heatmap, the price graphs and the PDF."""
    frame = read()
    # The PDF embeds both images, so it has to be produced last.
    for render in (graph_corr, graph_price, create_pdf):
        render(frame)
# SS objects for several ss.com flat-sale listing URLs, each created with a
# listing URL and a short name. Only flats_riga is referenced elsewhere in
# this file (main() calls its get_data()).
# NOTE(review): the other four look unused here — confirm against
# ss_scraper before removing, the constructor may have side effects.
flats_riga = SS(
"https://www.ss.com/lv/real-estate/flats/riga/all/sell/", "riga")
flats_rigareg = SS(
"https://www.ss.com/lv/real-estate/flats/riga-region/all/sell/", "rigareg")
flats_aizkraukle = SS(
"https://www.ss.com/lv/real-estate/flats/aizkraukle-and-reg/sell/", "aizkraukle")
flats_tukums = SS(
"https://www.ss.com/lv/real-estate/flats/tukums-and-reg/sell/", "tukums")
flats_ogre = SS(
"https://www.ss.com/lv/real-estate/flats/ogre-and-reg/sell/", "ogre")
# Usage text printed by main() for -h / --help.
OPERATIONS = """
python pd_pandas_k_f_cagulis.py
python pd_pandas_k_f_cagulis.py <operations>
Operations:
-h --help
-n --new Scrape new file
"""
def main(argv):
    """Handle the command-line options, then build the graphs and the PDF.

    Args:
        argv: Command-line arguments without the program name.
            ``-h``/``--help`` prints the usage text and exits;
            ``-n``/``--new`` scrapes fresh data first. Any other argument
            is ignored.
    """
    for arg in argv:
        if arg in ("-h", "--help"):
            print(OPERATIONS)
            # sys.exit() rather than the bare exit() helper, which is only
            # injected by the site module and may be absent.
            sys.exit()
        elif arg in ("-n", "--new"):
            flats_riga.get_data()
    make_dir()
    graph_plot()


if __name__ == "__main__":
    main(sys.argv[1:])