Mirror of https://github.com/kristoferssolo/School.git (synced 2025-10-21 20:10:38 +00:00)

Commit b3fc9f2cc3: task_180222 (parent 2489e585e5)
BIN  february/task_180222/output/excel/ss_riga_210222131848.xlsx (new file; binary content not shown)
Binary image changed (file not shown): 258 KiB before, 238 KiB after
Binary image changed (file not shown): 14 KiB before, 16 KiB after
@@ -6,6 +6,7 @@ import pandas as pd
 import seaborn as sns
 import matplotlib.pyplot as plt
 import requests
+import sys
 from pathlib import Path
 from random import randint
 from fpdf import FPDF
@@ -14,8 +15,6 @@ from PIL import Image
 from io import BytesIO
 from ss_scraper import SS

-# flats_few = SS("https://www.ss.com/lv/real-estate/flats/riga-region/all/sell/")
-# flats_few.get_data()
 output_path = "output/graphs"
 all_df = []

@@ -26,6 +25,8 @@ SERIES = "Sērija"
 ROOM_AMOUNT = "Istabu skaits"
 PUB_DATE = "Izvietošanas datums"

+COLUMNS = [PRICE, FLOOR, ROOM_AMOUNT, SERIES, QUADRATURE, PUB_DATE]
+
 series_photos = {
     "103.": "https://i.ss.com/gallery/5/902/225301/45060087.th2.jpg",
     "104.": "https://i.ss.com/gallery/5/888/221910/44381841.th2.jpg",
@@ -46,7 +47,6 @@ series_photos = {

-
 class priceGraphs:

     def __init__(self, data, pos, x_value, title, xlabel, y_value=PRICE, ylabel="Price"):
         self.pos = pos
         self.x_value = data[x_value]
@@ -81,7 +81,7 @@ def read():
         df_combined = df_combined.replace(value, replace_value(value, " ", ",", ""))

     for _ in df_combined[ROOM_AMOUNT]:
-        df_combined = df_combined.replace(["citi", "Citi"], "2")
+        df_combined = df_combined.replace(["citi", "Citi"], "7")
     try:
         for value in df_combined[ROOM_AMOUNT]:
             df_combined = df_combined.replace(value, int(value))
@@ -111,7 +111,6 @@ def graph_corr(data):

     sns.heatmap(data_corr.corr())
     plt.savefig(f"{output_path}/korelacija.png")
-    calc_average(data_corr)


 def graph_price(data):
@@ -141,29 +140,27 @@ def create_pdf(data):

     usable_w = pdf.w - 2 * pdf.l_margin
     width = usable_w / 7
-    hight = pdf.font_size * 2
-    LINE_HIGHT = 5
+    height = pdf.font_size * 2
+    LINE_HEIGHT = 5

-    columns = [PRICE, FLOOR, ROOM_AMOUNT, SERIES, QUADRATURE, PUB_DATE]
-
-    for column in columns:
+    for column in COLUMNS:
         if column == PUB_DATE:
             col_width = width * 2
         else:
             col_width = width
-        pdf.cell(col_width, hight, column, border=1)
+        pdf.cell(col_width, height, column, border=1)

-    pdf.ln(hight)
-    pdf.set_font()
+    pdf.ln(height)
     for _ in range(5):
-        rand_num = randint(2, len(data) - 10)
-        for column in columns:
+        rand_num = randint(2, len(data))
+        # print(str(data[column].iloc[rand_num])) # TODO: ERROR
+        for column in COLUMNS:
             if column == PUB_DATE:
                 col_width = width * 2
             else:
                 col_width = width
-            pdf.cell(col_width, hight, str(data[column][rand_num]), border=1)
-        pdf.ln(hight)
+            pdf.cell(col_width, height, str(data[column].iloc[rand_num]), border=1)
+        pdf.ln(height)

     text = """
     "Price to floor" grafiks - lielākā daļa pārdodamo dzīvokļu ir līdz 5. stāvam.
@@ -172,40 +169,41 @@ def create_pdf(data):
     "Price to series" grafiks - dārgākie dzīvokļi ir jaunie.
     "Price to date" grafiks - nesen pārdošanā ielikto dzīvokļu ir vairāk.
     """
-    pdf.ln(hight)
+    pdf.ln(height)
     pdf.image(f"{output_path}/korelacija.png", w=usable_w)
-    # pdf.write(LINE_HIGHT, "Starp istabu skaitu un cenu, kvadratūru un cenu ir liela korelācija.")
+    # pdf.write(LINE_HEIGHT, "Starp istabu skaitu un cenu, kvadratūru un cenu ir liela korelācija.")
     pdf.image(f"{output_path}/cenu_grafiki.png", w=usable_w)

     for txt in text.split("\n"):
-        pdf.write(LINE_HIGHT, txt.strip())
-        pdf.ln(LINE_HIGHT)
+        pdf.write(LINE_HEIGHT, txt.strip())
+        pdf.ln(LINE_HEIGHT)

-    average = calc_average(data)
+    average = calc_mode(data)
+    # print(average)
     for key, value in average.items():
-        if not isinstance(value, str):
-            value = str(round(value))
-        pdf.write(LINE_HIGHT, f"{key} - {value}")
-        pdf.ln(LINE_HIGHT)
+        print(f"{key} - {value}")
+        # if not isinstance(value, str):
+        #     value = str(round(value))
+        pdf.write(LINE_HEIGHT, f"{key} - {value}")
+        pdf.ln(LINE_HEIGHT)

-    response = requests.get(series_photos[average[SERIES]])
-    img = Image.open(BytesIO(response.content))
-    pdf.image(img)
+    # response = requests.get(series_photos[average[SERIES]])
+    # img = Image.open(BytesIO(response.content))
+    # pdf.image(img)
     pdf.output("output/pdf.pdf")


-def calc_average(data):
-    columns = [FLOOR, ROOM_AMOUNT, SERIES, QUADRATURE]
-    mean_price_columns = {FLOOR: None, ROOM_AMOUNT: None, SERIES: None, QUADRATURE: None}
-    for column in columns:
-        if column == SERIES:
+def calc_mode(data):
+    mode_columns = {}
+    for column in COLUMNS:
+        mode_columns[column] = (mode(data[column]))
+        # if column == SERIES:
             # print(data[column])
             # print(f"{column} = {mode(data[column])}")
-            mean_price_columns[column] = (mode(data[SERIES]))
-        else:
-            # print(f"{column} = {mode(data[column])}")
-            mean_price_columns[column] = mode(data[PRICE]) / mode(data[column])
-    return mean_price_columns
+        # else:
+        #     print(f"{column} = {mode(data[column])}")
+        #     mean_price_columns[column] = mode(data[PRICE]) / mode(data[column])
+    return mode_columns


 def graph_plot():
@@ -215,9 +213,23 @@ def graph_plot():
     create_pdf(data)


-def main():
+flats_riga = SS("https://www.ss.com/lv/real-estate/flats/riga/all/sell/", "riga")
+flats_rigareg = SS("https://www.ss.com/lv/real-estate/flats/riga-region/all/sell/", "rigareg")
+flats_aizkraukle = SS("https://www.ss.com/lv/real-estate/flats/aizkraukle-and-reg/sell/", "aizkraukle")
+flats_tukums = SS("https://www.ss.com/lv/real-estate/flats/tukums-and-reg/sell/", "tukums")
+flats_ogre = SS("https://www.ss.com/lv/real-estate/flats/ogre-and-reg/sell/", "ogre")
+
+
+def main(argv):
+    for arg in argv:
+        if arg == "-h" or arg == "--help":
+            print(f"{__file__} -N --new Scrape new file")
+            exit()
+        elif arg == "-n" or arg == "--new":
+            flats_riga.get_data()
+            # flats_ogre.get_data()
     graph_plot()


 if __name__ == "__main__":
-    main()
+    main(sys.argv[1:])
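Side note: the commit swaps the old price-per-column averages (calc_average) for a plain per-column mode summary (calc_mode). A minimal sketch of the same idea in isolation, assuming mode is Python's statistics.mode (its import sits outside the hunks shown) and using made-up sample rows rather than real scraped listings:

from statistics import mode

import pandas as pd

# Hypothetical sample rows mimicking two of the scraped columns (not real data).
data = pd.DataFrame({
    "Istabu skaits": [2, 3, 2, 1, 2],
    "Stāvs": [5, 2, 5, 9, 5],
})

# Same pattern as the new calc_mode(): keep the most common value of each column.
mode_columns = {column: mode(data[column]) for column in data.columns}
print(mode_columns)  # {'Istabu skaits': 2, 'Stāvs': 5}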
february/task_180222/requirements.txt (new file, 29 lines)
@@ -0,0 +1,29 @@
+beautifulsoup4==4.10.0
+bs4==0.0.1
+certifi==2021.10.8
+charset-normalizer==2.0.12
+cycler==0.11.0
+et-xmlfile==1.1.0
+fonttools==4.29.1
+fpdf2==2.5.0
+idna==3.3
+kiwisolver==1.3.2
+load-bar==0.0.7
+matplotlib==3.5.1
+numpy==1.22.2
+openpyxl==3.0.9
+packaging==21.3
+pandas==1.4.1
+Pillow==9.0.1
+progressbar2==4.0.0
+pyparsing==3.0.7
+python-dateutil==2.8.2
+python-utils==3.1.0
+pytz==2021.3
+requests==2.27.1
+scipy==1.8.0
+seaborn==0.11.2
+six==1.16.0
+soupsieve==2.3.1
+termcolor==1.1.0
+urllib3==1.26.8
@@ -5,6 +5,8 @@
 from bs4 import BeautifulSoup
 import requests
 import pandas as pd
+# import progressbar as pbar
+from loadbar import LoadBar
 from datetime import datetime

 HEADERS = {
@@ -14,7 +16,6 @@ HEADERS = {

-
 class SS:

     def __init__(self, url, name):
         self.url = url
         self.name = name
@@ -28,16 +29,22 @@ class SS:
             page_amount = last_url[last_url.find("page") + 4:last_url.find(".html")]
         except:
             page_amount = 1
-        print(f"Page amount = {page_amount}")
+        # print(f"Page amount = {page_amount}")

         return int(page_amount)

     def get_data(self):
         items = []
         item_no = 1
-        for page_number in range(1, self._get_page_amount() + 1):
-
+        page_amount = self._get_page_amount()
+        # widgets = ["Getting data...", pbar.Bar("*")]
+        # bar = pbar.ProgressBar(max_value=page_amount, widgets=widgets).start()
+        bar = LoadBar(max=page_amount * 30, head="#", body="#")
+        bar.start()
+
+        for page_number in range(1, page_amount + 1):
+
             url = self.url + f"/page{page_number}.html"
             page = requests.get(url, headers=HEADERS)
             soup = BeautifulSoup(page.content, 'html.parser')

@@ -45,11 +52,13 @@ class SS:
             ids = [tag['id'] for tag in soup.select('tr[id]')]  # creates list with ids
             ids = [x for x in ids if "tr_bnr" not in x]  # removes "tr_bnr" elements from list
             ids.remove("head_line")  # removes first "head_line" id
-            print(f"Page {page_number}")
+            # print(f"Page {page_number}")

             # getting item data
             for id in soup.find_all(id=ids):
-                print(f"Item {item_no}")
+                # print(f"Item {item_no}")
+                bar.update(step=item_no)
+
                 item_no += 1

                 for elem in id.find_all(class_='msga2-o pp6'):
@@ -73,14 +82,13 @@ class SS:
                 item_date = item_soup.find_all('td', class_='msg_footer')  # gets all 'msg_footer' class'
                 item_date = item_date[2].get_text()  # extracts 3rd element
                 items.append(item_date[8:18])  # crops date
+        bar.end()
         chunk_size = 8
         chunked_items_list = [items[i:i + chunk_size] for i in range(0, len(items), chunk_size)]  # combines each 'chunk_size' elements into array
         columns = ["Atrašanās vieta", "Istabu skaits", "Kvadratūra", "Stāvs", "Sērija", "Cena", "Pilns sludinājuma teksts", "Izvietošanas datums"]
         df = pd.DataFrame(chunked_items_list, columns=columns)
         time = datetime.now().strftime("%d%m%y%H%M%S")  # current time
         df.to_excel(excel_writer=f"output/excel/ss_{self.name}_{time}.xlsx", index=False)
-        print("Done")
-


 flats_riga = SS("https://www.ss.com/lv/real-estate/flats/riga/all/sell/", "riga")
@@ -92,6 +100,7 @@ flats_ogre = SS("https://www.ss.com/lv/real-estate/flats/ogre-and-reg/sell/", "ogre")

 def main():
     flats_riga.get_data()
+    # flats_rigareg.get_data()


 if __name__ == '__main__':
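The scraper's per-page and per-item print statements are commented out in favour of a load-bar progress bar. A minimal sketch of that pattern, using only the LoadBar calls that appear in this diff (the constructor's max/head/body arguments, start(), update(step=...), end()); the sleep is a placeholder for fetching and parsing one listing:

from time import sleep

from loadbar import LoadBar

total_items = 90  # stand-in for the page_amount * 30 estimate used in get_data()

bar = LoadBar(max=total_items, head="#", body="#")
bar.start()
for item_no in range(1, total_items + 1):
    sleep(0.01)  # placeholder for the real per-item scraping work
    bar.update(step=item_no)
bar.end()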