mirror of
https://github.com/kristoferssolo/School.git
synced 2025-10-21 20:10:38 +00:00
task_180222
This commit is contained in:
parent
2489e585e5
commit
b3fc9f2cc3
BIN
february/task_180222/output/excel/ss_riga_210222131848.xlsx
Normal file
BIN
february/task_180222/output/excel/ss_riga_210222131848.xlsx
Normal file
Binary file not shown.
Binary file not shown.
|
Before Width: | Height: | Size: 258 KiB After Width: | Height: | Size: 238 KiB |
Binary file not shown.
|
Before Width: | Height: | Size: 14 KiB After Width: | Height: | Size: 16 KiB |
Binary file not shown.
@ -6,6 +6,7 @@ import pandas as pd
|
||||
import seaborn as sns
|
||||
import matplotlib.pyplot as plt
|
||||
import requests
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from random import randint
|
||||
from fpdf import FPDF
|
||||
@ -14,8 +15,6 @@ from PIL import Image
|
||||
from io import BytesIO
|
||||
from ss_scraper import SS
|
||||
|
||||
# flats_few = SS("https://www.ss.com/lv/real-estate/flats/riga-region/all/sell/")
|
||||
# flats_few.get_data()
|
||||
output_path = "output/graphs"
|
||||
all_df = []
|
||||
|
||||
@ -26,6 +25,8 @@ SERIES = "Sērija"
|
||||
ROOM_AMOUNT = "Istabu skaits"
|
||||
PUB_DATE = "Izvietošanas datums"
|
||||
|
||||
COLUMNS = [PRICE, FLOOR, ROOM_AMOUNT, SERIES, QUADRATURE, PUB_DATE]
|
||||
|
||||
series_photos = {
|
||||
"103.": "https://i.ss.com/gallery/5/902/225301/45060087.th2.jpg",
|
||||
"104.": "https://i.ss.com/gallery/5/888/221910/44381841.th2.jpg",
|
||||
@ -46,7 +47,6 @@ series_photos = {
|
||||
|
||||
|
||||
class priceGraphs:
|
||||
|
||||
def __init__(self, data, pos, x_value, title, xlabel, y_value=PRICE, ylabel="Price"):
|
||||
self.pos = pos
|
||||
self.x_value = data[x_value]
|
||||
@ -81,7 +81,7 @@ def read():
|
||||
df_combined = df_combined.replace(value, replace_value(value, " ", ",", ""))
|
||||
|
||||
for _ in df_combined[ROOM_AMOUNT]:
|
||||
df_combined = df_combined.replace(["citi", "Citi"], "2")
|
||||
df_combined = df_combined.replace(["citi", "Citi"], "7")
|
||||
try:
|
||||
for value in df_combined[ROOM_AMOUNT]:
|
||||
df_combined = df_combined.replace(value, int(value))
|
||||
@ -111,7 +111,6 @@ def graph_corr(data):
|
||||
|
||||
sns.heatmap(data_corr.corr())
|
||||
plt.savefig(f"{output_path}/korelacija.png")
|
||||
calc_average(data_corr)
|
||||
|
||||
|
||||
def graph_price(data):
|
||||
@ -141,29 +140,27 @@ def create_pdf(data):
|
||||
|
||||
usable_w = pdf.w - 2 * pdf.l_margin
|
||||
width = usable_w / 7
|
||||
hight = pdf.font_size * 2
|
||||
LINE_HIGHT = 5
|
||||
height = pdf.font_size * 2
|
||||
LINE_HEIGHT = 5
|
||||
|
||||
columns = [PRICE, FLOOR, ROOM_AMOUNT, SERIES, QUADRATURE, PUB_DATE]
|
||||
|
||||
for column in columns:
|
||||
for column in COLUMNS:
|
||||
if column == PUB_DATE:
|
||||
col_width = width * 2
|
||||
else:
|
||||
col_width = width
|
||||
pdf.cell(col_width, hight, column, border=1)
|
||||
pdf.cell(col_width, height, column, border=1)
|
||||
|
||||
pdf.ln(hight)
|
||||
pdf.set_font()
|
||||
pdf.ln(height)
|
||||
for _ in range(5):
|
||||
rand_num = randint(2, len(data) - 10)
|
||||
for column in columns:
|
||||
rand_num = randint(2, len(data))
|
||||
# print(str(data[column].iloc[rand_num])) # TODO: ERROR
|
||||
for column in COLUMNS:
|
||||
if column == PUB_DATE:
|
||||
col_width = width * 2
|
||||
else:
|
||||
col_width = width
|
||||
pdf.cell(col_width, hight, str(data[column][rand_num]), border=1)
|
||||
pdf.ln(hight)
|
||||
pdf.cell(col_width, height, str(data[column].iloc[rand_num]), border=1)
|
||||
pdf.ln(height)
|
||||
|
||||
text = """
|
||||
"Price to floor" grafiks - lielākā daļa pārdodamo dzīvokļu ir līdz 5. stāvam.
|
||||
@ -172,40 +169,41 @@ def create_pdf(data):
|
||||
"Price to series" grafiks - dārgākie dzīvokļi ir jaunie.
|
||||
"Price to date" grafiks - nesen pārdošanā ielikto dzīvokļu ir vairāk.
|
||||
"""
|
||||
pdf.ln(hight)
|
||||
pdf.ln(height)
|
||||
pdf.image(f"{output_path}/korelacija.png", w=usable_w)
|
||||
# pdf.write(LINE_HIGHT, "Starp istabu skaitu un cenu, kvadratūru un cenu ir liela korelācija.")
|
||||
# pdf.write(LINE_HEIGHT, "Starp istabu skaitu un cenu, kvadratūru un cenu ir liela korelācija.")
|
||||
pdf.image(f"{output_path}/cenu_grafiki.png", w=usable_w)
|
||||
|
||||
for txt in text.split("\n"):
|
||||
pdf.write(LINE_HIGHT, txt.strip())
|
||||
pdf.ln(LINE_HIGHT)
|
||||
pdf.write(LINE_HEIGHT, txt.strip())
|
||||
pdf.ln(LINE_HEIGHT)
|
||||
|
||||
average = calc_average(data)
|
||||
average = calc_mode(data)
|
||||
# print(average)
|
||||
for key, value in average.items():
|
||||
if not isinstance(value, str):
|
||||
value = str(round(value))
|
||||
pdf.write(LINE_HIGHT, f"{key} - {value}")
|
||||
pdf.ln(LINE_HIGHT)
|
||||
print(f"{key} - {value}")
|
||||
# if not isinstance(value, str):
|
||||
# value = str(round(value))
|
||||
pdf.write(LINE_HEIGHT, f"{key} - {value}")
|
||||
pdf.ln(LINE_HEIGHT)
|
||||
|
||||
response = requests.get(series_photos[average[SERIES]])
|
||||
img = Image.open(BytesIO(response.content))
|
||||
pdf.image(img)
|
||||
# response = requests.get(series_photos[average[SERIES]])
|
||||
# img = Image.open(BytesIO(response.content))
|
||||
# pdf.image(img)
|
||||
pdf.output("output/pdf.pdf")
|
||||
|
||||
|
||||
def calc_average(data):
    """Build a per-column summary relating the price mode to each column's mode.

    Args:
        data: Object indexable by column name (``data[column]``).

    Returns:
        dict keyed by FLOOR, ROOM_AMOUNT, SERIES and QUADRATURE (in that
        order).  The SERIES entry is ``mode(data[SERIES])``; every other
        entry is ``mode(data[PRICE]) / mode(data[column])``.
    """
    # NOTE(review): ``mode`` is imported outside this view - presumably
    # statistics.mode; confirm.
    return {
        name: (
            mode(data[SERIES])
            if name == SERIES
            else mode(data[PRICE]) / mode(data[name])
        )
        for name in (FLOOR, ROOM_AMOUNT, SERIES, QUADRATURE)
    }
|
||||
def calc_mode(data):
    """Apply ``mode`` to every column named in ``COLUMNS``.

    Args:
        data: Object indexable by column name (``data[column]``).

    Returns:
        dict mapping each name in ``COLUMNS`` (in ``COLUMNS`` order) to
        ``mode(data[name])``.
    """
    # NOTE(review): ``mode`` is imported outside this view - presumably
    # statistics.mode; confirm.
    return {name: mode(data[name]) for name in COLUMNS}
|
||||
|
||||
|
||||
def graph_plot():
|
||||
@ -215,9 +213,23 @@ def graph_plot():
|
||||
create_pdf(data)
|
||||
|
||||
|
||||
def main():
|
||||
flats_riga = SS("https://www.ss.com/lv/real-estate/flats/riga/all/sell/", "riga")
|
||||
flats_rigareg = SS("https://www.ss.com/lv/real-estate/flats/riga-region/all/sell/", "rigareg")
|
||||
flats_aizkraukle = SS("https://www.ss.com/lv/real-estate/flats/aizkraukle-and-reg/sell/", "aizkraukle")
|
||||
flats_tukums = SS("https://www.ss.com/lv/real-estate/flats/tukums-and-reg/sell/", "tukums")
|
||||
flats_ogre = SS("https://www.ss.com/lv/real-estate/flats/ogre-and-reg/sell/", "ogre")
|
||||
|
||||
|
||||
def main(argv):
    """Handle command-line flags, then build the graphs and the PDF report.

    Args:
        argv: Command-line arguments without the program name
            (the caller passes ``sys.argv[1:]``).

    Flags:
        -h, --help: print the usage line and exit without doing any work.
        -n, --new:  scrape a fresh data file via ``flats_riga.get_data()``
            before plotting.

    Raises:
        SystemExit: after printing the usage line for ``-h`` / ``--help``.
    """
    for arg in argv:
        if arg in ("-h", "--help"):
            # The usage text must name the flag that is actually parsed
            # below ("-n"); it previously advertised "-N", which is ignored.
            print(f"{__file__} -n --new Scrape new file")
            # sys.exit() instead of the site-provided exit(), which is meant
            # for interactive sessions and is not guaranteed to exist.
            sys.exit()
        elif arg in ("-n", "--new"):
            flats_riga.get_data()
            # flats_ogre.get_data()
    graph_plot()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
main(sys.argv[1:])
|
||||
29
february/task_180222/requirements.txt
Normal file
29
february/task_180222/requirements.txt
Normal file
@ -0,0 +1,29 @@
|
||||
beautifulsoup4==4.10.0
|
||||
bs4==0.0.1
|
||||
certifi==2021.10.8
|
||||
charset-normalizer==2.0.12
|
||||
cycler==0.11.0
|
||||
et-xmlfile==1.1.0
|
||||
fonttools==4.29.1
|
||||
fpdf2==2.5.0
|
||||
idna==3.3
|
||||
kiwisolver==1.3.2
|
||||
load-bar==0.0.7
|
||||
matplotlib==3.5.1
|
||||
numpy==1.22.2
|
||||
openpyxl==3.0.9
|
||||
packaging==21.3
|
||||
pandas==1.4.1
|
||||
Pillow==9.0.1
|
||||
progressbar2==4.0.0
|
||||
pyparsing==3.0.7
|
||||
python-dateutil==2.8.2
|
||||
python-utils==3.1.0
|
||||
pytz==2021.3
|
||||
requests==2.27.1
|
||||
scipy==1.8.0
|
||||
seaborn==0.11.2
|
||||
six==1.16.0
|
||||
soupsieve==2.3.1
|
||||
termcolor==1.1.0
|
||||
urllib3==1.26.8
|
||||
@ -5,6 +5,8 @@
|
||||
from bs4 import BeautifulSoup
|
||||
import requests
|
||||
import pandas as pd
|
||||
# import progressbar as pbar
|
||||
from loadbar import LoadBar
|
||||
from datetime import datetime
|
||||
|
||||
HEADERS = {
|
||||
@ -14,7 +16,6 @@ HEADERS = {
|
||||
|
||||
|
||||
class SS:
|
||||
|
||||
def __init__(self, url, name):
|
||||
self.url = url
|
||||
self.name = name
|
||||
@ -28,16 +29,22 @@ class SS:
|
||||
page_amount = last_url[last_url.find("page") + 4:last_url.find(".html")]
|
||||
except:
|
||||
page_amount = 1
|
||||
print(f"Page amount = {page_amount}")
|
||||
# print(f"Page amount = {page_amount}")
|
||||
|
||||
return int(page_amount)
|
||||
|
||||
def get_data(self):
|
||||
items = []
|
||||
item_no = 1
|
||||
for page_number in range(1, self._get_page_amount() + 1):
|
||||
url = self.url + f"/page{page_number}.html"
|
||||
page_amount = self._get_page_amount()
|
||||
# widgets = ["Getting data...", pbar.Bar("*")]
|
||||
# bar = pbar.ProgressBar(max_value=page_amount, widgets=widgets).start()
|
||||
bar = LoadBar(max=page_amount * 30, head="#", body="#")
|
||||
bar.start()
|
||||
|
||||
for page_number in range(1, page_amount + 1):
|
||||
|
||||
url = self.url + f"/page{page_number}.html"
|
||||
page = requests.get(url, headers=HEADERS)
|
||||
soup = BeautifulSoup(page.content, 'html.parser')
|
||||
|
||||
@ -45,11 +52,13 @@ class SS:
|
||||
ids = [tag['id'] for tag in soup.select('tr[id]')] # creates list with ids
|
||||
ids = [x for x in ids if "tr_bnr" not in x] # removes "tr_bnr" elements from list
|
||||
ids.remove("head_line") # removes first "head_line" id
|
||||
print(f"Page {page_number}")
|
||||
# print(f"Page {page_number}")
|
||||
|
||||
# getting item data
|
||||
for id in soup.find_all(id=ids):
|
||||
print(f"Item {item_no}")
|
||||
# print(f"Item {item_no}")
|
||||
bar.update(step=item_no)
|
||||
|
||||
item_no += 1
|
||||
|
||||
for elem in id.find_all(class_='msga2-o pp6'):
|
||||
@ -73,14 +82,13 @@ class SS:
|
||||
item_date = item_soup.find_all('td', class_='msg_footer') # gets all 'msg_footer' class'
|
||||
item_date = item_date[2].get_text() # extracts 3rd element
|
||||
items.append(item_date[8:18]) # crops date
|
||||
|
||||
bar.end()
|
||||
chunk_size = 8
|
||||
chunked_items_list = [items[i:i + chunk_size] for i in range(0, len(items), chunk_size)] # combines each 'chunk_size' elements into array
|
||||
columns = ["Atrašanās vieta", "Istabu skaits", "Kvadratūra", "Stāvs", "Sērija", "Cena", "Pilns sludinājuma teksts", "Izvietošanas datums"]
|
||||
df = pd.DataFrame(chunked_items_list, columns=columns)
|
||||
time = datetime.now().strftime("%d%m%y%H%M%S") # current time
|
||||
df.to_excel(excel_writer=f"output/excel/ss_{self.name}_{time}.xlsx", index=False)
|
||||
print("Done")
|
||||
|
||||
|
||||
flats_riga = SS("https://www.ss.com/lv/real-estate/flats/riga/all/sell/", "riga")
|
||||
@ -92,6 +100,7 @@ flats_ogre = SS("https://www.ss.com/lv/real-estate/flats/ogre-and-reg/sell/", "o
|
||||
|
||||
def main():
|
||||
flats_riga.get_data()
|
||||
# flats_rigareg.get_data()
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
Loading…
Reference in New Issue
Block a user