task_180222

Kristofers-Solo 2022-02-21 16:34:52 +02:00
parent 2489e585e5
commit b3fc9f2cc3
7 changed files with 101 additions and 51 deletions

[Binary image updated: 258 KiB -> 238 KiB]

[Binary image updated: 14 KiB -> 16 KiB]

[Binary file changed (preview not shown)]

View File

@@ -6,6 +6,7 @@ import pandas as pd
 import seaborn as sns
 import matplotlib.pyplot as plt
 import requests
+import sys
 from pathlib import Path
 from random import randint
 from fpdf import FPDF
@@ -14,8 +15,6 @@ from PIL import Image
 from io import BytesIO
 from ss_scraper import SS
-# flats_few = SS("https://www.ss.com/lv/real-estate/flats/riga-region/all/sell/")
-# flats_few.get_data()
 output_path = "output/graphs"
 all_df = []
@@ -26,6 +25,8 @@ SERIES = "Sērija"
 ROOM_AMOUNT = "Istabu skaits"
 PUB_DATE = "Izvietošanas datums"
+COLUMNS = [PRICE, FLOOR, ROOM_AMOUNT, SERIES, QUADRATURE, PUB_DATE]
 series_photos = {
     "103.": "https://i.ss.com/gallery/5/902/225301/45060087.th2.jpg",
     "104.": "https://i.ss.com/gallery/5/888/221910/44381841.th2.jpg",
@@ -46,7 +47,6 @@ series_photos = {
 class priceGraphs:
     def __init__(self, data, pos, x_value, title, xlabel, y_value=PRICE, ylabel="Price"):
         self.pos = pos
         self.x_value = data[x_value]
@@ -81,7 +81,7 @@ def read():
         df_combined = df_combined.replace(value, replace_value(value, " ", ",", ""))
     for _ in df_combined[ROOM_AMOUNT]:
-        df_combined = df_combined.replace(["citi", "Citi"], "2")
+        df_combined = df_combined.replace(["citi", "Citi"], "7")
     try:
         for value in df_combined[ROOM_AMOUNT]:
             df_combined = df_combined.replace(value, int(value))
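The hunk above maps the placeholder room count "citi"/"Citi" to "7" and then casts the values to integers one element at a time. A vectorized pandas sketch of the same cleanup, using hypothetical data and assuming every remaining value is a numeric string (this is not the commit's code):

    import pandas as pd

    df = pd.DataFrame({"Istabu skaits": ["2", "citi", "3", "Citi"]})  # hypothetical raw column
    # Replace the placeholder, then cast the whole column in one step instead of looping.
    df["Istabu skaits"] = df["Istabu skaits"].replace(["citi", "Citi"], "7").astype(int)
    # -> 2, 7, 3, 7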
@@ -111,7 +111,6 @@ def graph_corr(data):
     sns.heatmap(data_corr.corr())
     plt.savefig(f"{output_path}/korelacija.png")
-    calc_average(data_corr)

 def graph_price(data):
@@ -141,29 +140,27 @@ def create_pdf(data):
     usable_w = pdf.w - 2 * pdf.l_margin
     width = usable_w / 7
-    hight = pdf.font_size * 2
-    LINE_HIGHT = 5
-    columns = [PRICE, FLOOR, ROOM_AMOUNT, SERIES, QUADRATURE, PUB_DATE]
-    for column in columns:
+    height = pdf.font_size * 2
+    LINE_HEIGHT = 5
+    for column in COLUMNS:
         if column == PUB_DATE:
             col_width = width * 2
         else:
             col_width = width
-        pdf.cell(col_width, hight, column, border=1)
-    pdf.ln(hight)
-    pdf.set_font()
+        pdf.cell(col_width, height, column, border=1)
+    pdf.ln(height)
     for _ in range(5):
-        rand_num = randint(2, len(data) - 10)
-        for column in columns:
+        rand_num = randint(2, len(data))
+        # print(str(data[column].iloc[rand_num])) # TODO: ERROR
+        for column in COLUMNS:
             if column == PUB_DATE:
                 col_width = width * 2
             else:
                 col_width = width
-            pdf.cell(col_width, hight, str(data[column][rand_num]), border=1)
-        pdf.ln(hight)
+            pdf.cell(col_width, height, str(data[column].iloc[rand_num]), border=1)
+        pdf.ln(height)
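A note on the new rand_num = randint(2, len(data)) line above: random.randint is inclusive at both ends, so it can return len(data), which is one past the last valid .iloc position. A bounds-safe sketch with hypothetical data (not the commit's code):

    from random import randint
    import pandas as pd

    data = pd.DataFrame({"Cena": [45000, 52000, 39000]})  # hypothetical stand-in for the listings frame
    rand_num = randint(0, len(data) - 1)  # always a valid .iloc position
    row = data.iloc[rand_num]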
text = """ text = """
"Price to floor" grafiks - lielākā daļa pārdodamo dzīvokļu ir līdz 5. stāvam. "Price to floor" grafiks - lielākā daļa pārdodamo dzīvokļu ir līdz 5. stāvam.
@@ -172,40 +169,41 @@ def create_pdf(data)
     "Price to series" grafiks - dārgākie dzīvokļi ir jaunie.
     "Price to date" grafiks - nesen pārdošanā ielikto dzīvokļu ir vairāk.
     """
-    pdf.ln(hight)
+    pdf.ln(height)
     pdf.image(f"{output_path}/korelacija.png", w=usable_w)
-    # pdf.write(LINE_HIGHT, "Starp istabu skaitu un cenu, kvadratūru un cenu ir liela korelācija.")
+    # pdf.write(LINE_HEIGHT, "Starp istabu skaitu un cenu, kvadratūru un cenu ir liela korelācija.")
     pdf.image(f"{output_path}/cenu_grafiki.png", w=usable_w)
     for txt in text.split("\n"):
-        pdf.write(LINE_HIGHT, txt.strip())
-        pdf.ln(LINE_HIGHT)
+        pdf.write(LINE_HEIGHT, txt.strip())
+        pdf.ln(LINE_HEIGHT)
-    average = calc_average(data)
-    # print(average)
+    average = calc_mode(data)
     for key, value in average.items():
-        if not isinstance(value, str):
-            value = str(round(value))
-        pdf.write(LINE_HIGHT, f"{key} - {value}")
-        pdf.ln(LINE_HIGHT)
+        print(f"{key} - {value}")
+        # if not isinstance(value, str):
+        #     value = str(round(value))
+        pdf.write(LINE_HEIGHT, f"{key} - {value}")
+        pdf.ln(LINE_HEIGHT)
-    response = requests.get(series_photos[average[SERIES]])
-    img = Image.open(BytesIO(response.content))
-    pdf.image(img)
+    # response = requests.get(series_photos[average[SERIES]])
+    # img = Image.open(BytesIO(response.content))
+    # pdf.image(img)
     pdf.output("output/pdf.pdf")
-def calc_average(data):
-    columns = [FLOOR, ROOM_AMOUNT, SERIES, QUADRATURE]
-    mean_price_columns = {FLOOR: None, ROOM_AMOUNT: None, SERIES: None, QUADRATURE: None}
-    for column in columns:
-        if column == SERIES:
-            # print(data[column])
-            # print(f"{column} = {mode(data[column])}")
-            mean_price_columns[column] = (mode(data[SERIES]))
-        else:
-            # print(f"{column} = {mode(data[column])}")
-            mean_price_columns[column] = mode(data[PRICE]) / mode(data[column])
-    return mean_price_columns
+def calc_mode(data):
+    mode_columns = {}
+    for column in COLUMNS:
+        mode_columns[column] = (mode(data[column]))
+        # if column == SERIES:
+        # print(data[column])
+        # print(f"{column} = {mode(data[column])}")
+        # else:
+        # print(f"{column} = {mode(data[column])}")
+        # mean_price_columns[column] = mode(data[PRICE]) / mode(data[column])
+    return mode_columns

 def graph_plot():
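For reference, the bare mode(...) calls above suggest Python's statistics.mode (the import lies outside this hunk), which returns the most common value of an iterable, so the new calc_mode maps each column name to its most frequent entry. A minimal standalone sketch with hypothetical data:

    from statistics import mode
    import pandas as pd

    df = pd.DataFrame({"Istabu skaits": [2, 3, 2, 1], "Sērija": ["103.", "602.", "103.", "103."]})
    modes = {column: mode(df[column]) for column in df.columns}
    # {'Istabu skaits': 2, 'Sērija': '103.'}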
@@ -215,9 +213,23 @@ def graph_plot():
     create_pdf(data)

-def main():
+flats_riga = SS("https://www.ss.com/lv/real-estate/flats/riga/all/sell/", "riga")
+flats_rigareg = SS("https://www.ss.com/lv/real-estate/flats/riga-region/all/sell/", "rigareg")
+flats_aizkraukle = SS("https://www.ss.com/lv/real-estate/flats/aizkraukle-and-reg/sell/", "aizkraukle")
+flats_tukums = SS("https://www.ss.com/lv/real-estate/flats/tukums-and-reg/sell/", "tukums")
+flats_ogre = SS("https://www.ss.com/lv/real-estate/flats/ogre-and-reg/sell/", "ogre")
+
+def main(argv):
+    for arg in argv:
+        if arg == "-h" or arg == "--help":
+            print(f"{__file__} -N --new Scrape new file")
+            exit()
+        elif arg == "-n" or arg == "--new":
+            flats_riga.get_data()
+            # flats_ogre.get_data()
     graph_plot()

 if __name__ == "__main__":
-    main()
+    main(sys.argv[1:])
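With these changes the script only re-scrapes when asked to. Assuming it is run as a script (its filename is not shown in this diff), the new main(argv) can also be exercised directly; these hypothetical calls are equivalent to passing -n/--new or nothing on the command line:

    main(["--new"])   # scrape a fresh flats_riga file, then rebuild graphs and the PDF
    main([])          # no flags: just rebuild graphs and the PDF from existing data
    main(["--help"])  # print the usage line and exit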

View File

@@ -0,0 +1,29 @@
beautifulsoup4==4.10.0
bs4==0.0.1
certifi==2021.10.8
charset-normalizer==2.0.12
cycler==0.11.0
et-xmlfile==1.1.0
fonttools==4.29.1
fpdf2==2.5.0
idna==3.3
kiwisolver==1.3.2
load-bar==0.0.7
matplotlib==3.5.1
numpy==1.22.2
openpyxl==3.0.9
packaging==21.3
pandas==1.4.1
Pillow==9.0.1
progressbar2==4.0.0
pyparsing==3.0.7
python-dateutil==2.8.2
python-utils==3.1.0
pytz==2021.3
requests==2.27.1
scipy==1.8.0
seaborn==0.11.2
six==1.16.0
soupsieve==2.3.1
termcolor==1.1.0
urllib3==1.26.8
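Assuming the new file above is the project's requirements.txt (the diff view does not show the filename), the pinned set installs in the usual way:

    pip install -r requirements.txt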

View File

@@ -5,6 +5,8 @@
 from bs4 import BeautifulSoup
 import requests
 import pandas as pd
+# import progressbar as pbar
+from loadbar import LoadBar
 from datetime import datetime

 HEADERS = {
@@ -14,7 +16,6 @@ HEADERS = {
 class SS:
     def __init__(self, url, name):
         self.url = url
         self.name = name
@@ -28,16 +29,22 @@ class SS:
             page_amount = last_url[last_url.find("page") + 4:last_url.find(".html")]
         except:
             page_amount = 1
-        print(f"Page amount = {page_amount}")
+        # print(f"Page amount = {page_amount}")
         return int(page_amount)

     def get_data(self):
         items = []
         item_no = 1
-        for page_number in range(1, self._get_page_amount() + 1):
-            url = self.url + f"/page{page_number}.html"
+        page_amount = self._get_page_amount()
+        # widgets = ["Getting data...", pbar.Bar("*")]
+        # bar = pbar.ProgressBar(max_value=page_amount, widgets=widgets).start()
+        bar = LoadBar(max=page_amount * 30, head="#", body="#")
+        bar.start()
+        for page_number in range(1, page_amount + 1):
+            url = self.url + f"/page{page_number}.html"
             page = requests.get(url, headers=HEADERS)
             soup = BeautifulSoup(page.content, 'html.parser')
@@ -45,11 +52,13 @@ class SS:
             ids = [tag['id'] for tag in soup.select('tr[id]')]  # creates list with ids
             ids = [x for x in ids if "tr_bnr" not in x]  # removes "tr_bnr" elements from list
             ids.remove("head_line")  # removes first "head_line" id
-            print(f"Page {page_number}")
+            # print(f"Page {page_number}")
             # getting item data
             for id in soup.find_all(id=ids):
-                print(f"Item {item_no}")
+                # print(f"Item {item_no}")
+                bar.update(step=item_no)
                 item_no += 1
                 for elem in id.find_all(class_='msga2-o pp6'):
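The progress-bar calls introduced in the two hunks above replace the old per-page and per-item print statements. Condensed into one place, and only echoing the LoadBar calls that actually appear in this diff (the factor of 30 is the commit's own guess at listings per page, and the page count here is hypothetical), the lifecycle is:

    from loadbar import LoadBar

    page_amount = 3  # hypothetical page count
    bar = LoadBar(max=page_amount * 30, head="#", body="#")
    bar.start()
    for item_no in range(1, page_amount * 30 + 1):
        bar.update(step=item_no)  # one tick per scraped listing
    bar.end()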
@@ -73,14 +82,13 @@ class SS:
                 item_date = item_soup.find_all('td', class_='msg_footer')  # gets all 'msg_footer' class'
                 item_date = item_date[2].get_text()  # extracts 3rd element
                 items.append(item_date[8:18])  # crops date
+        bar.end()
         chunk_size = 8
         chunked_items_list = [items[i:i + chunk_size] for i in range(0, len(items), chunk_size)]  # combines each 'chunk_size' elements into array
         columns = ["Atrašanās vieta", "Istabu skaits", "Kvadratūra", "Stāvs", "Sērija", "Cena", "Pilns sludinājuma teksts", "Izvietošanas datums"]
         df = pd.DataFrame(chunked_items_list, columns=columns)
         time = datetime.now().strftime("%d%m%y%H%M%S")  # current time
         df.to_excel(excel_writer=f"output/excel/ss_{self.name}_{time}.xlsx", index=False)
-        print("Done")
 flats_riga = SS("https://www.ss.com/lv/real-estate/flats/riga/all/sell/", "riga")

@@ -92,6 +100,7 @@ flats_ogre = SS("https://www.ss.com/lv/real-estate/flats/ogre-and-reg/sell/", "ogre")
 def main():
     flats_riga.get_data()
+    # flats_rigareg.get_data()

 if __name__ == '__main__':