task_180222

This commit is contained in:
Kristofers-Solo 2022-02-21 16:34:52 +02:00
parent 2489e585e5
commit b3fc9f2cc3
7 changed files with 101 additions and 51 deletions

Binary file not shown.

Before

Width:  |  Height:  |  Size: 258 KiB

After

Width:  |  Height:  |  Size: 238 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 14 KiB

After

Width:  |  Height:  |  Size: 16 KiB

Binary file not shown.

View File

@ -6,6 +6,7 @@ import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import requests
import sys
from pathlib import Path
from random import randint
from fpdf import FPDF
@ -14,8 +15,6 @@ from PIL import Image
from io import BytesIO
from ss_scraper import SS
# flats_few = SS("https://www.ss.com/lv/real-estate/flats/riga-region/all/sell/")
# flats_few.get_data()
output_path = "output/graphs"
all_df = []
@ -26,6 +25,8 @@ SERIES = "Sērija"
ROOM_AMOUNT = "Istabu skaits"
PUB_DATE = "Izvietošanas datums"
COLUMNS = [PRICE, FLOOR, ROOM_AMOUNT, SERIES, QUADRATURE, PUB_DATE]
series_photos = {
"103.": "https://i.ss.com/gallery/5/902/225301/45060087.th2.jpg",
"104.": "https://i.ss.com/gallery/5/888/221910/44381841.th2.jpg",
@ -46,7 +47,6 @@ series_photos = {
class priceGraphs:
def __init__(self, data, pos, x_value, title, xlabel, y_value=PRICE, ylabel="Price"):
self.pos = pos
self.x_value = data[x_value]
@ -81,7 +81,7 @@ def read():
df_combined = df_combined.replace(value, replace_value(value, " ", ",", ""))
for _ in df_combined[ROOM_AMOUNT]:
df_combined = df_combined.replace(["citi", "Citi"], "2")
df_combined = df_combined.replace(["citi", "Citi"], "7")
try:
for value in df_combined[ROOM_AMOUNT]:
df_combined = df_combined.replace(value, int(value))
@ -111,7 +111,6 @@ def graph_corr(data):
sns.heatmap(data_corr.corr())
plt.savefig(f"{output_path}/korelacija.png")
calc_average(data_corr)
def graph_price(data):
@ -141,29 +140,27 @@ def create_pdf(data):
usable_w = pdf.w - 2 * pdf.l_margin
width = usable_w / 7
hight = pdf.font_size * 2
LINE_HIGHT = 5
height = pdf.font_size * 2
LINE_HEIGHT = 5
columns = [PRICE, FLOOR, ROOM_AMOUNT, SERIES, QUADRATURE, PUB_DATE]
for column in columns:
for column in COLUMNS:
if column == PUB_DATE:
col_width = width * 2
else:
col_width = width
pdf.cell(col_width, hight, column, border=1)
pdf.cell(col_width, height, column, border=1)
pdf.ln(hight)
pdf.set_font()
pdf.ln(height)
for _ in range(5):
rand_num = randint(2, len(data) - 10)
for column in columns:
rand_num = randint(2, len(data))
# print(str(data[column].iloc[rand_num])) # TODO: ERROR
for column in COLUMNS:
if column == PUB_DATE:
col_width = width * 2
else:
col_width = width
pdf.cell(col_width, hight, str(data[column][rand_num]), border=1)
pdf.ln(hight)
pdf.cell(col_width, height, str(data[column].iloc[rand_num]), border=1)
pdf.ln(height)
text = """
"Price to floor" grafiks - lielākā daļa pārdodamo dzīvokļu ir līdz 5. stāvam.
@ -172,40 +169,41 @@ def create_pdf(data):
"Price to series" grafiks - dārgākie dzīvokļi ir jaunie.
"Price to date" grafiks - nesen pārdošanā ielikto dzīvokļu ir vairāk.
"""
pdf.ln(hight)
pdf.ln(height)
pdf.image(f"{output_path}/korelacija.png", w=usable_w)
# pdf.write(LINE_HIGHT, "Starp istabu skaitu un cenu, kvadratūru un cenu ir liela korelācija.")
# pdf.write(LINE_HEIGHT, "Starp istabu skaitu un cenu, kvadratūru un cenu ir liela korelācija.")
pdf.image(f"{output_path}/cenu_grafiki.png", w=usable_w)
for txt in text.split("\n"):
pdf.write(LINE_HIGHT, txt.strip())
pdf.ln(LINE_HIGHT)
pdf.write(LINE_HEIGHT, txt.strip())
pdf.ln(LINE_HEIGHT)
average = calc_average(data)
average = calc_mode(data)
# print(average)
for key, value in average.items():
if not isinstance(value, str):
value = str(round(value))
pdf.write(LINE_HIGHT, f"{key} - {value}")
pdf.ln(LINE_HIGHT)
print(f"{key} - {value}")
# if not isinstance(value, str):
# value = str(round(value))
pdf.write(LINE_HEIGHT, f"{key} - {value}")
pdf.ln(LINE_HEIGHT)
response = requests.get(series_photos[average[SERIES]])
img = Image.open(BytesIO(response.content))
pdf.image(img)
# response = requests.get(series_photos[average[SERIES]])
# img = Image.open(BytesIO(response.content))
# pdf.image(img)
pdf.output("output/pdf.pdf")
def calc_average(data):
    """Relate the most common price to the most common value of each column.

    Args:
        data: Table indexed by column name; it is read at PRICE, FLOOR,
            ROOM_AMOUNT, SERIES and QUADRATURE (presumably the combined
            pandas DataFrame; confirm against the caller).

    Returns:
        dict keyed by FLOOR, ROOM_AMOUNT, SERIES and QUADRATURE, in that
        order. SERIES maps to ``mode(data[SERIES])``; every other key maps
        to ``mode(data[PRICE]) / mode(data[key])``.
    """
    # `mode` is imported outside the visible part of this file.
    return {
        name: (
            mode(data[SERIES])
            if name == SERIES
            else mode(data[PRICE]) / mode(data[name])
        )
        for name in (FLOOR, ROOM_AMOUNT, SERIES, QUADRATURE)
    }
def calc_mode(data):
    """Return the most common value of every report column.

    Args:
        data: Table indexed by the column names listed in ``COLUMNS``
            (presumably the combined pandas DataFrame; confirm against
            the caller).

    Returns:
        dict mapping each name in ``COLUMNS`` to ``mode(data[name])``,
        in ``COLUMNS`` order.
    """
    # `mode` is imported outside the visible part of this file.
    return {name: mode(data[name]) for name in COLUMNS}
def graph_plot():
@ -215,9 +213,23 @@ def graph_plot():
create_pdf(data)
def main():
flats_riga = SS("https://www.ss.com/lv/real-estate/flats/riga/all/sell/", "riga")
flats_rigareg = SS("https://www.ss.com/lv/real-estate/flats/riga-region/all/sell/", "rigareg")
flats_aizkraukle = SS("https://www.ss.com/lv/real-estate/flats/aizkraukle-and-reg/sell/", "aizkraukle")
flats_tukums = SS("https://www.ss.com/lv/real-estate/flats/tukums-and-reg/sell/", "tukums")
flats_ogre = SS("https://www.ss.com/lv/real-estate/flats/ogre-and-reg/sell/", "ogre")
def main(argv):
    """Handle command-line flags, then run ``graph_plot()``.

    Args:
        argv: Command-line arguments without the program name
            (the script entry point passes ``sys.argv[1:]``).

    Recognised flags:
        -h, --help: print a one-line usage message and exit.
        -n, --new: scrape fresh data with ``flats_riga.get_data()``
            before plotting.

    Any other argument is ignored.

    Raises:
        SystemExit: when a help flag is given.
    """
    for arg in argv:
        if arg in ("-h", "--help"):
            # The usage line must name the flag that is actually parsed
            # below ("-n"); it previously advertised "-N", which is ignored.
            print(f"{__file__} -n --new Scrape new file")
            # sys.exit() instead of exit(): the latter is a helper added by
            # the site module and is not guaranteed to exist (e.g. python -S).
            sys.exit()
        elif arg in ("-n", "--new"):
            flats_riga.get_data()
    graph_plot()
if __name__ == "__main__":
main()
main(sys.argv[1:])

View File

@ -0,0 +1,29 @@
beautifulsoup4==4.10.0
bs4==0.0.1
certifi==2021.10.8
charset-normalizer==2.0.12
cycler==0.11.0
et-xmlfile==1.1.0
fonttools==4.29.1
fpdf2==2.5.0
idna==3.3
kiwisolver==1.3.2
load-bar==0.0.7
matplotlib==3.5.1
numpy==1.22.2
openpyxl==3.0.9
packaging==21.3
pandas==1.4.1
Pillow==9.0.1
progressbar2==4.0.0
pyparsing==3.0.7
python-dateutil==2.8.2
python-utils==3.1.0
pytz==2021.3
requests==2.27.1
scipy==1.8.0
seaborn==0.11.2
six==1.16.0
soupsieve==2.3.1
termcolor==1.1.0
urllib3==1.26.8

View File

@ -5,6 +5,8 @@
from bs4 import BeautifulSoup
import requests
import pandas as pd
# import progressbar as pbar
from loadbar import LoadBar
from datetime import datetime
HEADERS = {
@ -14,7 +16,6 @@ HEADERS = {
class SS:
def __init__(self, url, name):
self.url = url
self.name = name
@ -28,16 +29,22 @@ class SS:
page_amount = last_url[last_url.find("page") + 4:last_url.find(".html")]
except:
page_amount = 1
print(f"Page amount = {page_amount}")
# print(f"Page amount = {page_amount}")
return int(page_amount)
def get_data(self):
items = []
item_no = 1
for page_number in range(1, self._get_page_amount() + 1):
url = self.url + f"/page{page_number}.html"
page_amount = self._get_page_amount()
# widgets = ["Getting data...", pbar.Bar("*")]
# bar = pbar.ProgressBar(max_value=page_amount, widgets=widgets).start()
bar = LoadBar(max=page_amount * 30, head="#", body="#")
bar.start()
for page_number in range(1, page_amount + 1):
url = self.url + f"/page{page_number}.html"
page = requests.get(url, headers=HEADERS)
soup = BeautifulSoup(page.content, 'html.parser')
@ -45,11 +52,13 @@ class SS:
ids = [tag['id'] for tag in soup.select('tr[id]')] # creates list with ids
ids = [x for x in ids if "tr_bnr" not in x] # removes "tr_bnr" elements from list
ids.remove("head_line") # removes first "head_line" id
print(f"Page {page_number}")
# print(f"Page {page_number}")
# getting item data
for id in soup.find_all(id=ids):
print(f"Item {item_no}")
# print(f"Item {item_no}")
bar.update(step=item_no)
item_no += 1
for elem in id.find_all(class_='msga2-o pp6'):
@ -73,14 +82,13 @@ class SS:
item_date = item_soup.find_all('td', class_='msg_footer') # gets all 'msg_footer' class'
item_date = item_date[2].get_text() # extracts 3rd element
items.append(item_date[8:18]) # crops date
bar.end()
chunk_size = 8
chunked_items_list = [items[i:i + chunk_size] for i in range(0, len(items), chunk_size)] # combines each 'chunk_size' elements into array
columns = ["Atrašanās vieta", "Istabu skaits", "Kvadratūra", "Stāvs", "Sērija", "Cena", "Pilns sludinājuma teksts", "Izvietošanas datums"]
df = pd.DataFrame(chunked_items_list, columns=columns)
time = datetime.now().strftime("%d%m%y%H%M%S") # current time
df.to_excel(excel_writer=f"output/excel/ss_{self.name}_{time}.xlsx", index=False)
print("Done")
flats_riga = SS("https://www.ss.com/lv/real-estate/flats/riga/all/sell/", "riga")
@ -92,6 +100,7 @@ flats_ogre = SS("https://www.ss.com/lv/real-estate/flats/ogre-and-reg/sell/", "o
def main():
    """Run the scraper for the Riga listings via ``flats_riga.get_data()``."""
    # Only the Riga scraper is enabled; the Riga-region one
    # (flats_rigareg) is currently switched off.
    flats_riga.get_data()
if __name__ == '__main__':