diff --git a/.gitignore b/.gitignore index dd6fc383..f9298454 100644 --- a/.gitignore +++ b/.gitignore @@ -3,4 +3,5 @@ **/.venv/ **.log /december/task_081221/files -/december/task_081221/*.log \ No newline at end of file +/december/task_081221/*.log +**.plk \ No newline at end of file diff --git a/february/task_180222/Roboto-Regular.pkl b/february/task_180222/Roboto-Regular.pkl new file mode 100644 index 00000000..dc0a03a5 Binary files /dev/null and b/february/task_180222/Roboto-Regular.pkl differ diff --git a/february/task_180222/fonts/Roboto-Regular.ttf b/february/task_180222/fonts/Roboto-Regular.ttf new file mode 100644 index 00000000..3d6861b4 Binary files /dev/null and b/february/task_180222/fonts/Roboto-Regular.ttf differ diff --git a/february/task_180222/output/excel/output_aizkraukle.xlsx b/february/task_180222/output/excel/output_aizkraukle.xlsx deleted file mode 100644 index f0356bcb..00000000 Binary files a/february/task_180222/output/excel/output_aizkraukle.xlsx and /dev/null differ diff --git a/february/task_180222/output/excel/output_few.xlsx b/february/task_180222/output/excel/output_few.xlsx deleted file mode 100644 index 47427d77..00000000 Binary files a/february/task_180222/output/excel/output_few.xlsx and /dev/null differ diff --git a/february/task_180222/output/excel/output_many_20022022124051.xlsx b/february/task_180222/output/excel/output_many_20022022124051.xlsx deleted file mode 100644 index 5dc3479d..00000000 Binary files a/february/task_180222/output/excel/output_many_20022022124051.xlsx and /dev/null differ diff --git a/february/task_180222/output/excel/output_ogre.xlsx b/february/task_180222/output/excel/output_ogre.xlsx deleted file mode 100644 index 6a5df2be..00000000 Binary files a/february/task_180222/output/excel/output_ogre.xlsx and /dev/null differ diff --git a/february/task_180222/output/excel/output_tukums.xlsx b/february/task_180222/output/excel/output_tukums.xlsx deleted file mode 100644 index 2d38cad1..00000000 Binary files a/february/task_180222/output/excel/output_tukums.xlsx and /dev/null differ diff --git a/output_many.xlsx b/february/task_180222/output/excel/ss_riga_180222102341.xlsx similarity index 100% rename from output_many.xlsx rename to february/task_180222/output/excel/ss_riga_180222102341.xlsx diff --git a/february/task_180222/output/excel/ss_riga_190222124051.xlsx b/february/task_180222/output/excel/ss_riga_190222124051.xlsx new file mode 100644 index 00000000..503b803a Binary files /dev/null and b/february/task_180222/output/excel/ss_riga_190222124051.xlsx differ diff --git a/february/task_180222/output/excel/ss_riga_210222000729.xlsx b/february/task_180222/output/excel/ss_riga_210222000729.xlsx new file mode 100644 index 00000000..982412f9 Binary files /dev/null and b/february/task_180222/output/excel/ss_riga_210222000729.xlsx differ diff --git a/february/task_180222/output/graphs/cenu_grafiki.png b/february/task_180222/output/graphs/cenu_grafiki.png index b53731de..1630af7f 100644 Binary files a/february/task_180222/output/graphs/cenu_grafiki.png and b/february/task_180222/output/graphs/cenu_grafiki.png differ diff --git a/february/task_180222/output/graphs/korelacija.png b/february/task_180222/output/graphs/korelacija.png index 25b77170..884f7af8 100644 Binary files a/february/task_180222/output/graphs/korelacija.png and b/february/task_180222/output/graphs/korelacija.png differ diff --git a/february/task_180222/output/pdf.pdf b/february/task_180222/output/pdf.pdf new file mode 100644 index 00000000..d5fd86e4 Binary files /dev/null and b/february/task_180222/output/pdf.pdf differ diff --git a/february/task_180222/pd_pandas_k_f_cagulis.py b/february/task_180222/pd_pandas_k_f_cagulis.py index fe9056b4..9b74edba 100644 --- a/february/task_180222/pd_pandas_k_f_cagulis.py +++ b/february/task_180222/pd_pandas_k_f_cagulis.py @@ -2,10 +2,16 @@ # Date - 17.02.2022. # Title - Patstāvīgais darbs - pandas -from pathlib import Path import pandas as pd import seaborn as sns import matplotlib.pyplot as plt +import requests +from pathlib import Path +from random import randint +from fpdf import FPDF +from statistics import mode +from PIL import Image +from io import BytesIO from ss_scraper import SS # flats_few = SS("https://www.ss.com/lv/real-estate/flats/riga-region/all/sell/") @@ -13,105 +19,202 @@ from ss_scraper import SS output_path = "output/graphs" all_df = [] +QUADRATURE = "Kvadratūra" +FLOOR = "Stāvs" +PRICE = "Cena" +SERIES = "Sērija" +ROOM_AMOUNT = "Istabu skaits" +PUB_DATE = "Izvietošanas datums" -def read(path): - df = pd.read_excel(path) - all_df.append(df) +series_photos = { + "103.": "https://i.ss.com/gallery/5/902/225301/45060087.th2.jpg", + "104.": "https://i.ss.com/gallery/5/888/221910/44381841.th2.jpg", + "119.": "https://i.ss.com/gallery/5/902/225443/45088567.th2.jpg", + "467.": "https://i.ss.com/gallery/5/892/222881/44576186.th2.jpg", + "602.": "https://i.ss.com/gallery/5/896/223820/44763891.th2.jpg", + "Čehu pr.": "https://i.ss.com/gallery/5/902/225358/45071499.th2.jpg", + "Hrušč.": "https://i.ss.com/gallery/5/896/223961/44792152.th2.jpg", + "LT proj.": "https://i.ss.com/gallery/5/873/218203/43640498.th2.jpg", + "M. ģim.": "https://i.ss.com/gallery/5/871/217506/43501012.th2.jpg", + "P. kara": "https://i.ss.com/gallery/5/902/225490/45097851.th2.jpg", + "Priv. m.": "https://i.ss.com/gallery/5/895/223697/44739240.th2.jpg", + "Renov.": "https://i.ss.com/gallery/5/902/225442/45088303.th2.jpg", + "Specpr.": "https://i.ss.com/gallery/5/902/225492/45098378.th2.jpg", + "Staļina": "https://i.ss.com/gallery/5/902/225440/45087952.th2.jpg", + "Jaun.": "https://i.ss.com/gallery/5/902/225456/45091154.th2.jpg" +} -def get_data(): +class priceGraphs: + + def __init__(self, data, pos, x_value, title, xlabel, y_value=PRICE, ylabel="Price"): + self.pos = pos + self.x_value = data[x_value] + self.y_value = data[y_value] + self.title = title + self.xlabel = xlabel + self.ylabel = ylabel + + def _graph_price(self): + plot = plt.subplot2grid((3, 2), self.pos) + plot.scatter(self.x_value, self.y_value) + plot.set_title(self.title) + plot.set_xlabel(self.xlabel) + plot.set_ylabel(self.ylabel) + + +def read(): files = list(Path(Path(__file__).parent.absolute()).glob("**/*.xlsx")) - for file in files: - read(file) - df_out = pd.concat(all_df).reset_index(drop=True) - # df_out.to_excel("output/excel/combined.xlsx", index=False) + for file_path in files: + all_df.append(pd.read_excel(file_path)) + df_combined = pd.concat(all_df).reset_index(drop=True) + df_combined.sort_values(by=[PRICE, PUB_DATE], inplace=True) + df_combined.drop_duplicates(subset="Pilns sludinājuma teksts", keep=False, inplace=True) # replaces floor value to intiger - for value in df_out["Stāvs"]: - df_out = df_out.replace(value, int(value[:value.find("/")])) + for value in df_combined[FLOOR]: + df_combined = df_combined.replace(value, int(float(value[:value.find("/")]))) # replaces price value to intiger - for value in df_out["Cena"]: - df_out = df_out.replace(value, replace_value(value)) - return df_out.sort_values(by="Cena") + for value in df_combined[PRICE]: + df_combined = df_combined.replace(value, replace_value(value, " ", ",", "")) + + for _ in df_combined[ROOM_AMOUNT]: + df_combined = df_combined.replace(["citi", "Citi"], "2") + try: + for value in df_combined[ROOM_AMOUNT]: + df_combined = df_combined.replace(value, int(value)) + except: + pass + # converts to datetime + df_combined[PUB_DATE] = pd.to_datetime(df_combined[PUB_DATE], format="%d.%m.%Y") + + # df_combined.to_excel("output/excel/combined.xlsx", index=False) + return df_combined.sort_values(by=[PRICE, PUB_DATE]) -def replace_value(value): - new_value = value[:value.find(" ")] - new_value = new_value.replace(",", "") - return int(new_value) - - -def graph_plot(): - data = get_data() - graph_corr(data) - graph_price(data) +# replace value +replace_value = lambda value, find, replace, replace_to: int(value[:value.find(find)].replace(replace, replace_to)) def graph_corr(data): data_corr = data.copy() - + plt.rc("font", size=8) + # gets all series series = [] - for i in data_corr["Sērija"]: + for i in data_corr[SERIES]: if i not in series: series.append(i) - j = 0 - for s in series: - data_corr = list(map(lambda x: x.replace(s, j), data_corr)) - j += 1 + # change series names to numbers + data_corr[SERIES] = data_corr[SERIES].replace(series, range(len(series))) - print(data_corr["Sērija"]) sns.heatmap(data_corr.corr()) plt.savefig(f"{output_path}/korelacija.png") + calc_average(data_corr) def graph_price(data): - # plot settings plt.figure(figsize=(50, 30)) plt.rc("font", size=15) - # plt.rc("font", titlesize=24) - # placing the plots in the plane - plot1 = plt.subplot2grid((3, 2), (0, 0)) - plot2 = plt.subplot2grid((3, 2), (0, 1)) - plot3 = plt.subplot2grid((3, 2), (1, 0)) - plot4 = plt.subplot2grid((3, 2), (1, 1)) - plot5 = plt.subplot2grid((3, 2), (2, 0)) + plot1 = priceGraphs(data, (0, 0), FLOOR, "Price to floor", "Floor") + plot2 = priceGraphs(data, (0, 1), ROOM_AMOUNT, "Price to room amount", "Room amount") + plot3 = priceGraphs(data, (1, 0), QUADRATURE, "Price to quadrature", "Quadrature") + plot4 = priceGraphs(data, (1, 1), SERIES, "Price to series", "Series") + plot5 = priceGraphs(data, (2, 0), PUB_DATE, "Price to date", "Date") - # floor to price - plot1.scatter(data["Cena"], data["Stāvs"]) - plot1.set_title("Floor to price") - plot1.set_xlabel("Price") - plot1.set_ylabel("Floor") - - # room amount to price - plot2.scatter(data["Cena"], data["Istabu skaits"]) - plot2.set_title("Room amount to price") - plot2.set_xlabel("Price") - plot2.set_ylabel("Room amount") - - # quadrature to price - plot3.scatter(data["Cena"], data["Kvadratūra"]) - plot3.set_title("Quadrature to price") - plot3.set_xlabel("Price") - plot3.set_ylabel("Quadrature") - - # series to price - plot4.scatter(data["Cena"], data["Sērija"]) - plot4.set_title("Series to price") - plot4.set_xlabel("Price") - plot4.set_ylabel("Series") - - # date to price - plot5.scatter(data["Cena"], data["Izvietošanas datums"]) - plot5.set_title("Date to price") - plot5.set_xlabel("Price") - plot5.set_ylabel("Date") + plot1._graph_price() + plot2._graph_price() + plot3._graph_price() + plot4._graph_price() + plot5._graph_price() plt.savefig(f"{output_path}/cenu_grafiki.png") +def create_pdf(data): + pdf = FPDF("P", "mm", "A4") + pdf.add_page() + pdf.add_font("Roboto", fname="fonts/Roboto-Regular.ttf", uni=True) + pdf.set_font("Roboto", size=12) + + usable_w = pdf.w - 2 * pdf.l_margin + width = usable_w / 7 + hight = pdf.font_size * 2 + LINE_HIGHT = 5 + + columns = [PRICE, FLOOR, ROOM_AMOUNT, SERIES, QUADRATURE, PUB_DATE] + + for column in columns: + if column == PUB_DATE: + col_width = width * 2 + else: + col_width = width + pdf.cell(col_width, hight, column, border=1) + + pdf.ln(hight) + pdf.set_font() + for _ in range(5): + rand_num = randint(2, len(data) - 10) + for column in columns: + if column == PUB_DATE: + col_width = width * 2 + else: + col_width = width + pdf.cell(col_width, hight, str(data[column][rand_num]), border=1) + pdf.ln(hight) + + text = """ + "Price to floor" grafiks - lielākā daļa pārdodamo dzīvokļu ir līdz 5. stāvam. + "Price to room amount" grafiks - jo mazāk istabu, jo lētāks dzīvoklis. + "Price to quadrature" grafiks - jo lielāka dzīvokļa platība, jo dārgāks dzīvoklis. + "Price to series" grafiks - dārgākie dzīvokļi ir jaunie. + "Price to date" grafiks - nesen pārdošanā ielikto dzīvokļu ir vairāk. + """ + pdf.ln(hight) + pdf.image(f"{output_path}/korelacija.png", w=usable_w) + # pdf.write(LINE_HIGHT, "Starp istabu skaitu un cenu, kvadratūru un cenu ir liela korelācija.") + pdf.image(f"{output_path}/cenu_grafiki.png", w=usable_w) + + for txt in text.split("\n"): + pdf.write(LINE_HIGHT, txt.strip()) + pdf.ln(LINE_HIGHT) + + average = calc_average(data) + for key, value in average.items(): + if not isinstance(value, str): + value = str(round(value)) + pdf.write(LINE_HIGHT, f"{key} - {value}") + pdf.ln(LINE_HIGHT) + + response = requests.get(series_photos[average[SERIES]]) + img = Image.open(BytesIO(response.content)) + pdf.image(img) + pdf.output("output/pdf.pdf") + + +def calc_average(data): + columns = [FLOOR, ROOM_AMOUNT, SERIES, QUADRATURE] + mean_price_columns = {FLOOR: None, ROOM_AMOUNT: None, SERIES: None, QUADRATURE: None} + for column in columns: + if column == SERIES: + # print(data[column]) + # print(f"{column} = {mode(data[column])}") + mean_price_columns[column] = (mode(data[SERIES])) + else: + # print(f"{column} = {mode(data[column])}") + mean_price_columns[column] = mode(data[PRICE]) / mode(data[column]) + return mean_price_columns + + +def graph_plot(): + data = read() + graph_corr(data) + graph_price(data) + create_pdf(data) + + def main(): graph_plot() diff --git a/february/task_180222/ss_scraper.py b/february/task_180222/ss_scraper.py index 59d760c0..bd2d55c3 100644 --- a/february/task_180222/ss_scraper.py +++ b/february/task_180222/ss_scraper.py @@ -14,6 +14,7 @@ HEADERS = { class SS: + def __init__(self, url, name): self.url = url self.name = name @@ -77,24 +78,20 @@ class SS: chunked_items_list = [items[i:i + chunk_size] for i in range(0, len(items), chunk_size)] # combines each 'chunk_size' elements into array columns = ["Atrašanās vieta", "Istabu skaits", "Kvadratūra", "Stāvs", "Sērija", "Cena", "Pilns sludinājuma teksts", "Izvietošanas datums"] df = pd.DataFrame(chunked_items_list, columns=columns) - time = datetime.now().strftime("%d%m%Y%H%M%S") - df.to_excel(excel_writer=f"output/excel/output_{self.name}_{time}.xlsx", index=False) + time = datetime.now().strftime("%d%m%y%H%M%S") # current time + df.to_excel(excel_writer=f"output/excel/ss_{self.name}_{time}.xlsx", index=False) print("Done") -flats_many = SS("https://www.ss.com/lv/real-estate/flats/riga/all/sell/", "many") -flats_few = SS("https://www.ss.com/lv/real-estate/flats/riga-region/all/sell/", "few") +flats_riga = SS("https://www.ss.com/lv/real-estate/flats/riga/all/sell/", "riga") +flats_rigareg = SS("https://www.ss.com/lv/real-estate/flats/riga-region/all/sell/", "rigareg") flats_aizkraukle = SS("https://www.ss.com/lv/real-estate/flats/aizkraukle-and-reg/sell/", "aizkraukle") flats_tukums = SS("https://www.ss.com/lv/real-estate/flats/tukums-and-reg/sell/", "tukums") flats_ogre = SS("https://www.ss.com/lv/real-estate/flats/ogre-and-reg/sell/", "ogre") def main(): - # flats_aizkraukle.get_data() - # flats_tukums.get_data() - # flats_ogre.get_data() - # flats_few.get_data() - flats_many.get_data() + flats_riga.get_data() if __name__ == '__main__': diff --git a/february/task_180222/test.py b/february/task_180222/test.py index 6e645ed9..9101bebb 100644 --- a/february/task_180222/test.py +++ b/february/task_180222/test.py @@ -1,5 +1,2 @@ -from datetime import datetime - -time = datetime.now().strftime("%d%m%Y%H%M%S") - -print(time) \ No newline at end of file +val = "5.00" +print(int((float(val)))) \ No newline at end of file