mirror of
https://github.com/kristoferssolo/School.git
synced 2025-10-21 20:10:38 +00:00
task_180222
This commit is contained in:
parent
8f9c22e1d6
commit
2489e585e5
1
.gitignore
vendored
1
.gitignore
vendored
@ -4,3 +4,4 @@
|
||||
**.log
|
||||
/december/task_081221/files
|
||||
/december/task_081221/*.log
|
||||
**.plk
|
||||
BIN
february/task_180222/Roboto-Regular.pkl
Normal file
BIN
february/task_180222/Roboto-Regular.pkl
Normal file
Binary file not shown.
BIN
february/task_180222/fonts/Roboto-Regular.ttf
Normal file
BIN
february/task_180222/fonts/Roboto-Regular.ttf
Normal file
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
BIN
february/task_180222/output/excel/ss_riga_190222124051.xlsx
Normal file
BIN
february/task_180222/output/excel/ss_riga_190222124051.xlsx
Normal file
Binary file not shown.
BIN
february/task_180222/output/excel/ss_riga_210222000729.xlsx
Normal file
BIN
february/task_180222/output/excel/ss_riga_210222000729.xlsx
Normal file
Binary file not shown.
Binary file not shown.
|
Before Width: | Height: | Size: 341 KiB After Width: | Height: | Size: 258 KiB |
Binary file not shown.
|
Before Width: | Height: | Size: 17 KiB After Width: | Height: | Size: 14 KiB |
BIN
february/task_180222/output/pdf.pdf
Normal file
BIN
february/task_180222/output/pdf.pdf
Normal file
Binary file not shown.
@ -2,10 +2,16 @@
|
||||
# Date - 17.02.2022.
|
||||
# Title - Patstāvīgais darbs - pandas
|
||||
|
||||
from pathlib import Path
|
||||
import pandas as pd
|
||||
import seaborn as sns
|
||||
import matplotlib.pyplot as plt
|
||||
import requests
|
||||
from pathlib import Path
|
||||
from random import randint
|
||||
from fpdf import FPDF
|
||||
from statistics import mode
|
||||
from PIL import Image
|
||||
from io import BytesIO
|
||||
from ss_scraper import SS
|
||||
|
||||
# flats_few = SS("https://www.ss.com/lv/real-estate/flats/riga-region/all/sell/")
|
||||
@ -13,105 +19,202 @@ from ss_scraper import SS
|
||||
output_path = "output/graphs"
|
||||
all_df = []
|
||||
|
||||
QUADRATURE = "Kvadratūra"
|
||||
FLOOR = "Stāvs"
|
||||
PRICE = "Cena"
|
||||
SERIES = "Sērija"
|
||||
ROOM_AMOUNT = "Istabu skaits"
|
||||
PUB_DATE = "Izvietošanas datums"
|
||||
|
||||
def read(path):
|
||||
df = pd.read_excel(path)
|
||||
all_df.append(df)
|
||||
series_photos = {
|
||||
"103.": "https://i.ss.com/gallery/5/902/225301/45060087.th2.jpg",
|
||||
"104.": "https://i.ss.com/gallery/5/888/221910/44381841.th2.jpg",
|
||||
"119.": "https://i.ss.com/gallery/5/902/225443/45088567.th2.jpg",
|
||||
"467.": "https://i.ss.com/gallery/5/892/222881/44576186.th2.jpg",
|
||||
"602.": "https://i.ss.com/gallery/5/896/223820/44763891.th2.jpg",
|
||||
"Čehu pr.": "https://i.ss.com/gallery/5/902/225358/45071499.th2.jpg",
|
||||
"Hrušč.": "https://i.ss.com/gallery/5/896/223961/44792152.th2.jpg",
|
||||
"LT proj.": "https://i.ss.com/gallery/5/873/218203/43640498.th2.jpg",
|
||||
"M. ģim.": "https://i.ss.com/gallery/5/871/217506/43501012.th2.jpg",
|
||||
"P. kara": "https://i.ss.com/gallery/5/902/225490/45097851.th2.jpg",
|
||||
"Priv. m.": "https://i.ss.com/gallery/5/895/223697/44739240.th2.jpg",
|
||||
"Renov.": "https://i.ss.com/gallery/5/902/225442/45088303.th2.jpg",
|
||||
"Specpr.": "https://i.ss.com/gallery/5/902/225492/45098378.th2.jpg",
|
||||
"Staļina": "https://i.ss.com/gallery/5/902/225440/45087952.th2.jpg",
|
||||
"Jaun.": "https://i.ss.com/gallery/5/902/225456/45091154.th2.jpg"
|
||||
}
|
||||
|
||||
|
||||
def get_data():
|
||||
class priceGraphs:
    """One scatter panel of the 3x2 price-graph grid.

    Keeps the two data columns to plot together with the panel's grid
    position, title and axis labels; ``_graph_price`` draws the panel.
    """

    def __init__(self, data, pos, x_value, title, xlabel, y_value=PRICE, ylabel="Price"):
        # `data` is looked up by column name; only the two selected columns are stored.
        self.pos = pos
        self.x_value, self.y_value = data[x_value], data[y_value]
        self.title, self.xlabel, self.ylabel = title, xlabel, ylabel

    def _graph_price(self):
        """Draw the scatter plot into this panel's cell of the 3x2 subplot grid."""
        axes = plt.subplot2grid((3, 2), self.pos)
        axes.scatter(self.x_value, self.y_value)
        axes.set_title(self.title)
        axes.set_xlabel(self.xlabel)
        axes.set_ylabel(self.ylabel)
|
||||
|
||||
|
||||
def read():
|
||||
files = list(Path(Path(__file__).parent.absolute()).glob("**/*.xlsx"))
|
||||
|
||||
for file in files:
|
||||
read(file)
|
||||
df_out = pd.concat(all_df).reset_index(drop=True)
|
||||
# df_out.to_excel("output/excel/combined.xlsx", index=False)
|
||||
for file_path in files:
|
||||
all_df.append(pd.read_excel(file_path))
|
||||
df_combined = pd.concat(all_df).reset_index(drop=True)
|
||||
df_combined.sort_values(by=[PRICE, PUB_DATE], inplace=True)
|
||||
df_combined.drop_duplicates(subset="Pilns sludinājuma teksts", keep=False, inplace=True)
|
||||
|
||||
# replaces floor value to intiger
|
||||
for value in df_out["Stāvs"]:
|
||||
df_out = df_out.replace(value, int(value[:value.find("/")]))
|
||||
for value in df_combined[FLOOR]:
|
||||
df_combined = df_combined.replace(value, int(float(value[:value.find("/")])))
|
||||
|
||||
# replaces price value to intiger
|
||||
for value in df_out["Cena"]:
|
||||
df_out = df_out.replace(value, replace_value(value))
|
||||
return df_out.sort_values(by="Cena")
|
||||
for value in df_combined[PRICE]:
|
||||
df_combined = df_combined.replace(value, replace_value(value, " ", ",", ""))
|
||||
|
||||
for _ in df_combined[ROOM_AMOUNT]:
|
||||
df_combined = df_combined.replace(["citi", "Citi"], "2")
|
||||
try:
|
||||
for value in df_combined[ROOM_AMOUNT]:
|
||||
df_combined = df_combined.replace(value, int(value))
|
||||
except:
|
||||
pass
|
||||
# converts to datetime
|
||||
df_combined[PUB_DATE] = pd.to_datetime(df_combined[PUB_DATE], format="%d.%m.%Y")
|
||||
|
||||
# df_combined.to_excel("output/excel/combined.xlsx", index=False)
|
||||
return df_combined.sort_values(by=[PRICE, PUB_DATE])
|
||||
|
||||
|
||||
def replace_value(value):
|
||||
new_value = value[:value.find(" ")]
|
||||
new_value = new_value.replace(",", "")
|
||||
return int(new_value)
|
||||
|
||||
|
||||
def graph_plot():
|
||||
data = get_data()
|
||||
graph_corr(data)
|
||||
graph_price(data)
|
||||
def replace_value(value, find, replace, replace_to):
    """Return the leading part of *value* converted to an integer.

    The text before the first occurrence of *find* is taken (the whole
    string when *find* does not occur), every *replace* in it is swapped
    for *replace_to*, and the result is passed to ``int``.

    Example: ``replace_value("45,000 €", " ", ",", "")`` returns ``45000``.

    Raises:
        ValueError: if the cleaned text is not a valid integer literal.
    """
    # str.partition keeps the whole string when the separator is missing;
    # slicing with str.find() == -1 would silently drop the last character.
    head, _, _ = value.partition(find)
    return int(head.replace(replace, replace_to))
|
||||
|
||||
|
||||
def graph_corr(data):
|
||||
data_corr = data.copy()
|
||||
|
||||
plt.rc("font", size=8)
|
||||
# gets all series
|
||||
series = []
|
||||
for i in data_corr["Sērija"]:
|
||||
for i in data_corr[SERIES]:
|
||||
if i not in series:
|
||||
series.append(i)
|
||||
j = 0
|
||||
for s in series:
|
||||
data_corr = list(map(lambda x: x.replace(s, j), data_corr))
|
||||
j += 1
|
||||
# change series names to numbers
|
||||
data_corr[SERIES] = data_corr[SERIES].replace(series, range(len(series)))
|
||||
|
||||
print(data_corr["Sērija"])
|
||||
sns.heatmap(data_corr.corr())
|
||||
plt.savefig(f"{output_path}/korelacija.png")
|
||||
calc_average(data_corr)
|
||||
|
||||
|
||||
def graph_price(data):
|
||||
# plot settings
|
||||
plt.figure(figsize=(50, 30))
|
||||
plt.rc("font", size=15)
|
||||
# plt.rc("font", titlesize=24)
|
||||
|
||||
# placing the plots in the plane
|
||||
plot1 = plt.subplot2grid((3, 2), (0, 0))
|
||||
plot2 = plt.subplot2grid((3, 2), (0, 1))
|
||||
plot3 = plt.subplot2grid((3, 2), (1, 0))
|
||||
plot4 = plt.subplot2grid((3, 2), (1, 1))
|
||||
plot5 = plt.subplot2grid((3, 2), (2, 0))
|
||||
plot1 = priceGraphs(data, (0, 0), FLOOR, "Price to floor", "Floor")
|
||||
plot2 = priceGraphs(data, (0, 1), ROOM_AMOUNT, "Price to room amount", "Room amount")
|
||||
plot3 = priceGraphs(data, (1, 0), QUADRATURE, "Price to quadrature", "Quadrature")
|
||||
plot4 = priceGraphs(data, (1, 1), SERIES, "Price to series", "Series")
|
||||
plot5 = priceGraphs(data, (2, 0), PUB_DATE, "Price to date", "Date")
|
||||
|
||||
# floor to price
|
||||
plot1.scatter(data["Cena"], data["Stāvs"])
|
||||
plot1.set_title("Floor to price")
|
||||
plot1.set_xlabel("Price")
|
||||
plot1.set_ylabel("Floor")
|
||||
|
||||
# room amount to price
|
||||
plot2.scatter(data["Cena"], data["Istabu skaits"])
|
||||
plot2.set_title("Room amount to price")
|
||||
plot2.set_xlabel("Price")
|
||||
plot2.set_ylabel("Room amount")
|
||||
|
||||
# quadrature to price
|
||||
plot3.scatter(data["Cena"], data["Kvadratūra"])
|
||||
plot3.set_title("Quadrature to price")
|
||||
plot3.set_xlabel("Price")
|
||||
plot3.set_ylabel("Quadrature")
|
||||
|
||||
# series to price
|
||||
plot4.scatter(data["Cena"], data["Sērija"])
|
||||
plot4.set_title("Series to price")
|
||||
plot4.set_xlabel("Price")
|
||||
plot4.set_ylabel("Series")
|
||||
|
||||
# date to price
|
||||
plot5.scatter(data["Cena"], data["Izvietošanas datums"])
|
||||
plot5.set_title("Date to price")
|
||||
plot5.set_xlabel("Price")
|
||||
plot5.set_ylabel("Date")
|
||||
plot1._graph_price()
|
||||
plot2._graph_price()
|
||||
plot3._graph_price()
|
||||
plot4._graph_price()
|
||||
plot5._graph_price()
|
||||
|
||||
plt.savefig(f"{output_path}/cenu_grafiki.png")
|
||||
|
||||
|
||||
def create_pdf(data):
    """Build the PDF report and write it to ``output/pdf.pdf``.

    The report contains a table of five randomly chosen listings, the
    correlation heatmap and the price graphs previously saved under
    ``output_path``, a short textual interpretation of the graphs, the
    values returned by ``calc_average`` and, when it can be downloaded, a
    photo of the most common building series.

    Args:
        data: pandas DataFrame holding the PRICE, FLOOR, ROOM_AMOUNT,
            SERIES, QUADRATURE and PUB_DATE columns.
    """
    pdf = FPDF("P", "mm", "A4")
    pdf.add_page()
    pdf.add_font("Roboto", fname="fonts/Roboto-Regular.ttf", uni=True)
    pdf.set_font("Roboto", size=12)

    usable_w = pdf.w - 2 * pdf.l_margin
    # Five single-width columns plus the double-width date column = 7 units.
    base_width = usable_w / 7
    row_height = pdf.font_size * 2
    line_height = 5

    columns = [PRICE, FLOOR, ROOM_AMOUNT, SERIES, QUADRATURE, PUB_DATE]
    col_widths = {
        column: base_width * 2 if column == PUB_DATE else base_width
        for column in columns
    }

    # Table header.
    for column in columns:
        pdf.cell(col_widths[column], row_height, column, border=1)
    pdf.ln(row_height)

    # NOTE(review): called without arguments as in the original; presumably a
    # no-op that keeps the current font - confirm against the fpdf version used.
    pdf.set_font()

    # Five random sample rows. Rows are picked by position (.iloc) because the
    # index labels may have gaps after de-duplication and sorting, and any
    # table with at least one row is supported.
    row_count = len(data)
    if row_count:
        for _ in range(5):
            position = randint(0, row_count - 1)
            for column in columns:
                cell_text = str(data[column].iloc[position])
                pdf.cell(col_widths[column], row_height, cell_text, border=1)
            pdf.ln(row_height)

    text = """
    "Price to floor" grafiks - lielākā daļa pārdodamo dzīvokļu ir līdz 5. stāvam.
    "Price to room amount" grafiks - jo mazāk istabu, jo lētāks dzīvoklis.
    "Price to quadrature" grafiks - jo lielāka dzīvokļa platība, jo dārgāks dzīvoklis.
    "Price to series" grafiks - dārgākie dzīvokļi ir jaunie.
    "Price to date" grafiks - nesen pārdošanā ielikto dzīvokļu ir vairāk.
    """
    pdf.ln(row_height)
    pdf.image(f"{output_path}/korelacija.png", w=usable_w)
    pdf.image(f"{output_path}/cenu_grafiki.png", w=usable_w)

    # Each line is stripped before writing, so the literal's indentation is irrelevant.
    for txt in text.split("\n"):
        pdf.write(line_height, txt.strip())
        pdf.ln(line_height)

    average = calc_average(data)
    for key, value in average.items():
        if not isinstance(value, str):
            value = str(round(value))
        pdf.write(line_height, f"{key} - {value}")
        pdf.ln(line_height)

    # The photo is optional: an unknown series or a failed download must not
    # prevent the report from being written.
    photo_url = series_photos.get(average[SERIES])
    if photo_url is not None:
        try:
            response = requests.get(photo_url, timeout=10)
            response.raise_for_status()
        except requests.RequestException as err:
            print(f"Could not download series photo: {err}")
        else:
            pdf.image(Image.open(BytesIO(response.content)))

    pdf.output("output/pdf.pdf")
|
||||
|
||||
|
||||
def calc_average(data):
    """Return per-column summary values keyed by column name.

    For SERIES the value is the most common series in *data*. For FLOOR,
    ROOM_AMOUNT and QUADRATURE it is the most common price divided by the
    most common value of that column.
    """

    def summarise(column):
        # The series column is not numeric, so it is reported as its mode only.
        if column == SERIES:
            return mode(data[SERIES])
        return mode(data[PRICE]) / mode(data[column])

    return {
        column: summarise(column)
        for column in (FLOOR, ROOM_AMOUNT, SERIES, QUADRATURE)
    }
|
||||
|
||||
|
||||
def graph_plot():
    """Load the listings, then draw the correlation heatmap, the price graphs and the PDF."""
    listings = read()
    # Each step receives the same table, in this fixed order.
    for step in (graph_corr, graph_price, create_pdf):
        step(listings)
|
||||
|
||||
|
||||
def main():
    """Script entry point: run the whole graph and report pipeline."""
    graph_plot()
|
||||
|
||||
|
||||
@ -14,6 +14,7 @@ HEADERS = {
|
||||
|
||||
|
||||
class SS:
|
||||
|
||||
def __init__(self, url, name):
|
||||
self.url = url
|
||||
self.name = name
|
||||
@ -77,24 +78,20 @@ class SS:
|
||||
chunked_items_list = [items[i:i + chunk_size] for i in range(0, len(items), chunk_size)] # combines each 'chunk_size' elements into array
|
||||
columns = ["Atrašanās vieta", "Istabu skaits", "Kvadratūra", "Stāvs", "Sērija", "Cena", "Pilns sludinājuma teksts", "Izvietošanas datums"]
|
||||
df = pd.DataFrame(chunked_items_list, columns=columns)
|
||||
time = datetime.now().strftime("%d%m%Y%H%M%S")
|
||||
df.to_excel(excel_writer=f"output/excel/output_{self.name}_{time}.xlsx", index=False)
|
||||
time = datetime.now().strftime("%d%m%y%H%M%S") # current time
|
||||
df.to_excel(excel_writer=f"output/excel/ss_{self.name}_{time}.xlsx", index=False)
|
||||
print("Done")
|
||||
|
||||
|
||||
flats_many = SS("https://www.ss.com/lv/real-estate/flats/riga/all/sell/", "many")
|
||||
flats_few = SS("https://www.ss.com/lv/real-estate/flats/riga-region/all/sell/", "few")
|
||||
flats_riga = SS("https://www.ss.com/lv/real-estate/flats/riga/all/sell/", "riga")
|
||||
flats_rigareg = SS("https://www.ss.com/lv/real-estate/flats/riga-region/all/sell/", "rigareg")
|
||||
flats_aizkraukle = SS("https://www.ss.com/lv/real-estate/flats/aizkraukle-and-reg/sell/", "aizkraukle")
|
||||
flats_tukums = SS("https://www.ss.com/lv/real-estate/flats/tukums-and-reg/sell/", "tukums")
|
||||
flats_ogre = SS("https://www.ss.com/lv/real-estate/flats/ogre-and-reg/sell/", "ogre")
|
||||
|
||||
|
||||
def main():
|
||||
# flats_aizkraukle.get_data()
|
||||
# flats_tukums.get_data()
|
||||
# flats_ogre.get_data()
|
||||
# flats_few.get_data()
|
||||
flats_many.get_data()
|
||||
flats_riga.get_data()
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
@ -1,5 +1,2 @@
|
||||
from datetime import datetime
|
||||
|
||||
time = datetime.now().strftime("%d%m%Y%H%M%S")
|
||||
|
||||
print(time)
|
||||
val = "5.00"
|
||||
print(int((float(val))))
|
||||
Loading…
Reference in New Issue
Block a user