diff --git a/february/task_180222/output/excel/ss_riga_210222131848.xlsx b/february/task_180222/output/excel/ss_riga_210222131848.xlsx new file mode 100644 index 00000000..2f369f64 Binary files /dev/null and b/february/task_180222/output/excel/ss_riga_210222131848.xlsx differ diff --git a/february/task_180222/output/graphs/cenu_grafiki.png b/february/task_180222/output/graphs/cenu_grafiki.png index 1630af7f..17aebd64 100644 Binary files a/february/task_180222/output/graphs/cenu_grafiki.png and b/february/task_180222/output/graphs/cenu_grafiki.png differ diff --git a/february/task_180222/output/graphs/korelacija.png b/february/task_180222/output/graphs/korelacija.png index 884f7af8..f9f072ec 100644 Binary files a/february/task_180222/output/graphs/korelacija.png and b/february/task_180222/output/graphs/korelacija.png differ diff --git a/february/task_180222/output/pdf.pdf b/february/task_180222/output/pdf.pdf index d5fd86e4..0ff55eab 100644 Binary files a/february/task_180222/output/pdf.pdf and b/february/task_180222/output/pdf.pdf differ diff --git a/february/task_180222/pd_pandas_k_f_cagulis.py b/february/task_180222/pd_pandas_k_f_cagulis.py index 9b74edba..1a9ce7e7 100644 --- a/february/task_180222/pd_pandas_k_f_cagulis.py +++ b/february/task_180222/pd_pandas_k_f_cagulis.py @@ -6,6 +6,7 @@ import pandas as pd import seaborn as sns import matplotlib.pyplot as plt import requests +import sys from pathlib import Path from random import randint from fpdf import FPDF @@ -14,8 +15,6 @@ from PIL import Image from io import BytesIO from ss_scraper import SS -# flats_few = SS("https://www.ss.com/lv/real-estate/flats/riga-region/all/sell/") -# flats_few.get_data() output_path = "output/graphs" all_df = [] @@ -26,6 +25,8 @@ SERIES = "Sērija" ROOM_AMOUNT = "Istabu skaits" PUB_DATE = "Izvietošanas datums" +COLUMNS = [PRICE, FLOOR, ROOM_AMOUNT, SERIES, QUADRATURE, PUB_DATE] + series_photos = { "103.": "https://i.ss.com/gallery/5/902/225301/45060087.th2.jpg", "104.": "https://i.ss.com/gallery/5/888/221910/44381841.th2.jpg", @@ -46,7 +47,6 @@ series_photos = { class priceGraphs: - def __init__(self, data, pos, x_value, title, xlabel, y_value=PRICE, ylabel="Price"): self.pos = pos self.x_value = data[x_value] @@ -81,7 +81,7 @@ def read(): df_combined = df_combined.replace(value, replace_value(value, " ", ",", "")) for _ in df_combined[ROOM_AMOUNT]: - df_combined = df_combined.replace(["citi", "Citi"], "2") + df_combined = df_combined.replace(["citi", "Citi"], "7") try: for value in df_combined[ROOM_AMOUNT]: df_combined = df_combined.replace(value, int(value)) @@ -111,7 +111,6 @@ def graph_corr(data): sns.heatmap(data_corr.corr()) plt.savefig(f"{output_path}/korelacija.png") - calc_average(data_corr) def graph_price(data): @@ -141,29 +140,27 @@ def create_pdf(data): usable_w = pdf.w - 2 * pdf.l_margin width = usable_w / 7 - hight = pdf.font_size * 2 - LINE_HIGHT = 5 + height = pdf.font_size * 2 + LINE_HEIGHT = 5 - columns = [PRICE, FLOOR, ROOM_AMOUNT, SERIES, QUADRATURE, PUB_DATE] - - for column in columns: + for column in COLUMNS: if column == PUB_DATE: col_width = width * 2 else: col_width = width - pdf.cell(col_width, hight, column, border=1) + pdf.cell(col_width, height, column, border=1) - pdf.ln(hight) - pdf.set_font() + pdf.ln(height) for _ in range(5): - rand_num = randint(2, len(data) - 10) - for column in columns: + rand_num = randint(2, len(data)) + # print(str(data[column].iloc[rand_num])) # TODO: ERROR + for column in COLUMNS: if column == PUB_DATE: col_width = width * 2 else: col_width = width - pdf.cell(col_width, hight, str(data[column][rand_num]), border=1) - pdf.ln(hight) + pdf.cell(col_width, height, str(data[column].iloc[rand_num]), border=1) + pdf.ln(height) text = """ "Price to floor" grafiks - lielākā daļa pārdodamo dzīvokļu ir līdz 5. stāvam. @@ -172,40 +169,41 @@ def create_pdf(data): "Price to series" grafiks - dārgākie dzīvokļi ir jaunie. "Price to date" grafiks - nesen pārdošanā ielikto dzīvokļu ir vairāk. """ - pdf.ln(hight) + pdf.ln(height) pdf.image(f"{output_path}/korelacija.png", w=usable_w) - # pdf.write(LINE_HIGHT, "Starp istabu skaitu un cenu, kvadratūru un cenu ir liela korelācija.") + # pdf.write(LINE_HEIGHT, "Starp istabu skaitu un cenu, kvadratūru un cenu ir liela korelācija.") pdf.image(f"{output_path}/cenu_grafiki.png", w=usable_w) for txt in text.split("\n"): - pdf.write(LINE_HIGHT, txt.strip()) - pdf.ln(LINE_HIGHT) + pdf.write(LINE_HEIGHT, txt.strip()) + pdf.ln(LINE_HEIGHT) - average = calc_average(data) + average = calc_mode(data) + # print(average) for key, value in average.items(): - if not isinstance(value, str): - value = str(round(value)) - pdf.write(LINE_HIGHT, f"{key} - {value}") - pdf.ln(LINE_HIGHT) + print(f"{key} - {value}") + # if not isinstance(value, str): + # value = str(round(value)) + pdf.write(LINE_HEIGHT, f"{key} - {value}") + pdf.ln(LINE_HEIGHT) - response = requests.get(series_photos[average[SERIES]]) - img = Image.open(BytesIO(response.content)) - pdf.image(img) + # response = requests.get(series_photos[average[SERIES]]) + # img = Image.open(BytesIO(response.content)) + # pdf.image(img) pdf.output("output/pdf.pdf") -def calc_average(data): - columns = [FLOOR, ROOM_AMOUNT, SERIES, QUADRATURE] - mean_price_columns = {FLOOR: None, ROOM_AMOUNT: None, SERIES: None, QUADRATURE: None} - for column in columns: - if column == SERIES: - # print(data[column]) - # print(f"{column} = {mode(data[column])}") - mean_price_columns[column] = (mode(data[SERIES])) - else: - # print(f"{column} = {mode(data[column])}") - mean_price_columns[column] = mode(data[PRICE]) / mode(data[column]) - return mean_price_columns +def calc_mode(data): + mode_columns = {} + for column in COLUMNS: + mode_columns[column] = (mode(data[column])) + # if column == SERIES: + # print(data[column]) + # print(f"{column} = {mode(data[column])}") + # else: + # print(f"{column} = {mode(data[column])}") + # mean_price_columns[column] = mode(data[PRICE]) / mode(data[column]) + return mode_columns def graph_plot(): @@ -215,9 +213,23 @@ def graph_plot(): create_pdf(data) -def main(): +flats_riga = SS("https://www.ss.com/lv/real-estate/flats/riga/all/sell/", "riga") +flats_rigareg = SS("https://www.ss.com/lv/real-estate/flats/riga-region/all/sell/", "rigareg") +flats_aizkraukle = SS("https://www.ss.com/lv/real-estate/flats/aizkraukle-and-reg/sell/", "aizkraukle") +flats_tukums = SS("https://www.ss.com/lv/real-estate/flats/tukums-and-reg/sell/", "tukums") +flats_ogre = SS("https://www.ss.com/lv/real-estate/flats/ogre-and-reg/sell/", "ogre") + + +def main(argv): + for arg in argv: + if arg == "-h" or arg == "--help": + print(f"{__file__} -N --new Scrape new file") + exit() + elif arg == "-n" or arg == "--new": + flats_riga.get_data() + # flats_ogre.get_data() graph_plot() if __name__ == "__main__": - main() \ No newline at end of file + main(sys.argv[1:]) \ No newline at end of file diff --git a/february/task_180222/requirements.txt b/february/task_180222/requirements.txt new file mode 100644 index 00000000..4506a0a0 --- /dev/null +++ b/february/task_180222/requirements.txt @@ -0,0 +1,29 @@ +beautifulsoup4==4.10.0 +bs4==0.0.1 +certifi==2021.10.8 +charset-normalizer==2.0.12 +cycler==0.11.0 +et-xmlfile==1.1.0 +fonttools==4.29.1 +fpdf2==2.5.0 +idna==3.3 +kiwisolver==1.3.2 +load-bar==0.0.7 +matplotlib==3.5.1 +numpy==1.22.2 +openpyxl==3.0.9 +packaging==21.3 +pandas==1.4.1 +Pillow==9.0.1 +progressbar2==4.0.0 +pyparsing==3.0.7 +python-dateutil==2.8.2 +python-utils==3.1.0 +pytz==2021.3 +requests==2.27.1 +scipy==1.8.0 +seaborn==0.11.2 +six==1.16.0 +soupsieve==2.3.1 +termcolor==1.1.0 +urllib3==1.26.8 diff --git a/february/task_180222/ss_scraper.py b/february/task_180222/ss_scraper.py index bd2d55c3..4fb80010 100644 --- a/february/task_180222/ss_scraper.py +++ b/february/task_180222/ss_scraper.py @@ -5,6 +5,8 @@ from bs4 import BeautifulSoup import requests import pandas as pd +# import progressbar as pbar +from loadbar import LoadBar from datetime import datetime HEADERS = { @@ -14,7 +16,6 @@ HEADERS = { class SS: - def __init__(self, url, name): self.url = url self.name = name @@ -28,16 +29,22 @@ class SS: page_amount = last_url[last_url.find("page") + 4:last_url.find(".html")] except: page_amount = 1 - print(f"Page amount = {page_amount}") + # print(f"Page amount = {page_amount}") return int(page_amount) def get_data(self): items = [] item_no = 1 - for page_number in range(1, self._get_page_amount() + 1): - url = self.url + f"/page{page_number}.html" + page_amount = self._get_page_amount() + # widgets = ["Getting data...", pbar.Bar("*")] + # bar = pbar.ProgressBar(max_value=page_amount, widgets=widgets).start() + bar = LoadBar(max=page_amount * 30, head="#", body="#") + bar.start() + for page_number in range(1, page_amount + 1): + + url = self.url + f"/page{page_number}.html" page = requests.get(url, headers=HEADERS) soup = BeautifulSoup(page.content, 'html.parser') @@ -45,11 +52,13 @@ class SS: ids = [tag['id'] for tag in soup.select('tr[id]')] # creates list with ids ids = [x for x in ids if "tr_bnr" not in x] # removes "tr_bnr" elements from list ids.remove("head_line") # removes first "head_line" id - print(f"Page {page_number}") + # print(f"Page {page_number}") # getting item data for id in soup.find_all(id=ids): - print(f"Item {item_no}") + # print(f"Item {item_no}") + bar.update(step=item_no) + item_no += 1 for elem in id.find_all(class_='msga2-o pp6'): @@ -73,14 +82,13 @@ class SS: item_date = item_soup.find_all('td', class_='msg_footer') # gets all 'msg_footer' class' item_date = item_date[2].get_text() # extracts 3rd element items.append(item_date[8:18]) # crops date - + bar.end() chunk_size = 8 chunked_items_list = [items[i:i + chunk_size] for i in range(0, len(items), chunk_size)] # combines each 'chunk_size' elements into array columns = ["Atrašanās vieta", "Istabu skaits", "Kvadratūra", "Stāvs", "Sērija", "Cena", "Pilns sludinājuma teksts", "Izvietošanas datums"] df = pd.DataFrame(chunked_items_list, columns=columns) time = datetime.now().strftime("%d%m%y%H%M%S") # current time df.to_excel(excel_writer=f"output/excel/ss_{self.name}_{time}.xlsx", index=False) - print("Done") flats_riga = SS("https://www.ss.com/lv/real-estate/flats/riga/all/sell/", "riga") @@ -92,6 +100,7 @@ flats_ogre = SS("https://www.ss.com/lv/real-estate/flats/ogre-and-reg/sell/", "o def main(): flats_riga.get_data() + # flats_rigareg.get_data() if __name__ == '__main__':