task_180222

This commit is contained in:
Krisotfers-Solo 2022-02-21 18:55:27 +02:00
parent b3fc9f2cc3
commit 6d100905fe
7 changed files with 86 additions and 60 deletions

Binary file not shown.

Before

Width:  |  Height:  |  Size: 238 KiB

After

Width:  |  Height:  |  Size: 242 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 16 KiB

After

Width:  |  Height:  |  Size: 15 KiB

Binary file not shown.

Binary file not shown.

View File

@ -1,5 +1,5 @@
# Author - Kristiāns Francis Cagulis # Author - Kristiāns Francis Cagulis
# Date - 17.02.2022. # Date - 21.02.2022.
# Title - Patstāvīgais darbs - pandas # Title - Patstāvīgais darbs - pandas
import pandas as pd import pandas as pd
@ -7,10 +7,11 @@ import seaborn as sns
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
import requests import requests
import sys import sys
from os import mkdir, listdir
from pathlib import Path from pathlib import Path
from random import randint from random import randint
from fpdf import FPDF from fpdf import FPDF
from statistics import mode from statistics import mode, mean
from PIL import Image from PIL import Image
from io import BytesIO from io import BytesIO
from ss_scraper import SS from ss_scraper import SS
@ -46,14 +47,16 @@ series_photos = {
} }
class priceGraphs: class priceGraph:
def __init__(self, data, pos, x_value, title, xlabel, y_value=PRICE, ylabel="Price"):
def __init__(self, data, pos, title, x_value, xlabel, xticks=None, y_value=PRICE, ylabel="Price"):
self.pos = pos self.pos = pos
self.x_value = data[x_value] self.x_value = data[x_value]
self.y_value = data[y_value] self.y_value = data[y_value]
self.title = title self.title = title
self.xlabel = xlabel self.xlabel = xlabel
self.ylabel = ylabel self.ylabel = ylabel
self.xticks = xticks
def _graph_price(self): def _graph_price(self):
plot = plt.subplot2grid((3, 2), self.pos) plot = plt.subplot2grid((3, 2), self.pos)
@ -61,6 +64,8 @@ class priceGraphs:
plot.set_title(self.title) plot.set_title(self.title)
plot.set_xlabel(self.xlabel) plot.set_xlabel(self.xlabel)
plot.set_ylabel(self.ylabel) plot.set_ylabel(self.ylabel)
if self.xticks != None:
plot.set_xticks(self.xticks)
def read(): def read():
@ -68,9 +73,9 @@ def read():
for file_path in files: for file_path in files:
all_df.append(pd.read_excel(file_path)) all_df.append(pd.read_excel(file_path))
df_combined = pd.concat(all_df).reset_index(drop=True) df_combined = pd.concat(all_df).reset_index(drop=True) # combine DataFrames
df_combined.sort_values(by=[PRICE, PUB_DATE], inplace=True) df_combined.sort_values(by=[PRICE, PUB_DATE], inplace=True) # sort DataFrame
df_combined.drop_duplicates(subset="Pilns sludinājuma teksts", keep=False, inplace=True) df_combined.drop_duplicates(keep=False, inplace=True) # drop duplicates
# replaces floor value to intiger # replaces floor value to intiger
for value in df_combined[FLOOR]: for value in df_combined[FLOOR]:
@ -80,18 +85,19 @@ def read():
for value in df_combined[PRICE]: for value in df_combined[PRICE]:
df_combined = df_combined.replace(value, replace_value(value, " ", ",", "")) df_combined = df_combined.replace(value, replace_value(value, " ", ",", ""))
# replaces "Citi" to 7
for _ in df_combined[ROOM_AMOUNT]: for _ in df_combined[ROOM_AMOUNT]:
df_combined = df_combined.replace(["citi", "Citi"], "7") df_combined = df_combined.replace(["citi", "Citi"], "7")
try:
# converts room amount to intiger
for value in df_combined[ROOM_AMOUNT]: for value in df_combined[ROOM_AMOUNT]:
df_combined = df_combined.replace(value, int(value)) df_combined = df_combined.replace(value, int(value))
except:
pass
# converts to datetime # converts to datetime
df_combined[PUB_DATE] = pd.to_datetime(df_combined[PUB_DATE], format="%d.%m.%Y") df_combined[PUB_DATE] = pd.to_datetime(df_combined[PUB_DATE], format="%d.%m.%Y").dt.date
# df_combined.to_excel("output/excel/combined.xlsx", index=False) # df_combined.to_excel("output/excel/combined.xlsx", index=False)
return df_combined.sort_values(by=[PRICE, PUB_DATE]) return df_combined.sort_values(by=PUB_DATE)
# replace value # replace value
@ -117,11 +123,11 @@ def graph_price(data):
plt.figure(figsize=(50, 30)) plt.figure(figsize=(50, 30))
plt.rc("font", size=15) plt.rc("font", size=15)
plot1 = priceGraphs(data, (0, 0), FLOOR, "Price to floor", "Floor") plot1 = priceGraph(data, (0, 0), "Price to floor", FLOOR, "Floor", range(1, max(data[FLOOR]) + 1))
plot2 = priceGraphs(data, (0, 1), ROOM_AMOUNT, "Price to room amount", "Room amount") plot2 = priceGraph(data, (0, 1), "Price to room amount", ROOM_AMOUNT, "Room amount")
plot3 = priceGraphs(data, (1, 0), QUADRATURE, "Price to quadrature", "Quadrature") plot3 = priceGraph(data, (1, 0), "Price to quadrature", QUADRATURE, "Quadrature")
plot4 = priceGraphs(data, (1, 1), SERIES, "Price to series", "Series") plot4 = priceGraph(data, (1, 1), "Price to series", SERIES, "Series")
plot5 = priceGraphs(data, (2, 0), PUB_DATE, "Price to date", "Date") plot5 = priceGraph(data, (2, 0), "Price to date", PUB_DATE, "Date")
plot1._graph_price() plot1._graph_price()
plot2._graph_price() plot2._graph_price()
@ -143,6 +149,7 @@ def create_pdf(data):
height = pdf.font_size * 2 height = pdf.font_size * 2
LINE_HEIGHT = 5 LINE_HEIGHT = 5
# table head
for column in COLUMNS: for column in COLUMNS:
if column == PUB_DATE: if column == PUB_DATE:
col_width = width * 2 col_width = width * 2
@ -151,9 +158,9 @@ def create_pdf(data):
pdf.cell(col_width, height, column, border=1) pdf.cell(col_width, height, column, border=1)
pdf.ln(height) pdf.ln(height)
# table contents
for _ in range(5): for _ in range(5):
rand_num = randint(2, len(data)) rand_num = randint(2, len(data))
# print(str(data[column].iloc[rand_num])) # TODO: ERROR
for column in COLUMNS: for column in COLUMNS:
if column == PUB_DATE: if column == PUB_DATE:
col_width = width * 2 col_width = width * 2
@ -162,48 +169,57 @@ def create_pdf(data):
pdf.cell(col_width, height, str(data[column].iloc[rand_num]), border=1) pdf.cell(col_width, height, str(data[column].iloc[rand_num]), border=1)
pdf.ln(height) pdf.ln(height)
text = """
"Price to floor" grafiks - lielākā daļa pārdodamo dzīvokļu ir līdz 5. stāvam.
"Price to room amount" grafiks - jo mazāk istabu, jo lētāks dzīvoklis.
"Price to quadrature" grafiks - jo lielāka dzīvokļa platība, jo dārgāks dzīvoklis.
"Price to series" grafiks - dārgākie dzīvokļi ir jaunie.
"Price to date" grafiks - nesen pārdošanā ielikto dzīvokļu ir vairāk.
"""
pdf.ln(height) pdf.ln(height)
pdf.image(f"{output_path}/korelacija.png", w=usable_w) pdf.image(f"{output_path}/korelacija.png", w=usable_w) # corr graph
# pdf.write(LINE_HEIGHT, "Starp istabu skaitu un cenu, kvadratūru un cenu ir liela korelācija.") pdf.write(LINE_HEIGHT, "Starp istabu skaitu un cenu, kvadratūru un cenu ir liela korelācija.")
pdf.image(f"{output_path}/cenu_grafiki.png", w=usable_w) pdf.ln(height)
pdf.image(f"{output_path}/cenu_grafiki.png", w=usable_w) # price graph
# price graph conclusions
text = """
"Price to floor" grafiks - lielākā daļa pārdodamo dzīvokļu ir līdz 6. stāvam.
"Price to room amount" grafiks - veido normālo sadalījumu (Gausa sadalījumu).
"Price to quadrature" grafiks - jo lielāka dzīvokļa platība, jo dārgāks dzīvoklis.
"Price to series" grafiks - jaunie, renovētie un pēc kara dzīvokļi ir dārgāki.
"Price to date" grafiks - nav nekādas sakarības.
"""
for txt in text.split("\n"): for txt in text.split("\n"):
pdf.write(LINE_HEIGHT, txt.strip()) pdf.write(LINE_HEIGHT, txt.strip())
pdf.ln(LINE_HEIGHT) pdf.ln(LINE_HEIGHT)
average = calc_mode(data) # mean/mode values
# print(average) text = [
for key, value in average.items(): "Vidējā cena: ", "Vidējā cena attiecībā pret kvadratūru: ", "Sērijas moda: ", "Vidējā cena attiecībā pret istabu skaitu: ",
print(f"{key} - {value}") "Vidējā cena attiecībā pret stāvu: "
# if not isinstance(value, str): ]
# value = str(round(value)) values = [
pdf.write(LINE_HEIGHT, f"{key} - {value}") round(mean(data[PRICE]), 2),
round(mean(data[PRICE]) / mean(data[QUADRATURE])),
mode(data[SERIES]),
round(mean(data[PRICE]) / mean(data[ROOM_AMOUNT])),
round(mean(data[PRICE]) / mean(data[FLOOR]))
]
for txt, value in zip(text, values):
pdf.write(LINE_HEIGHT, f"{txt}{value}")
pdf.ln(LINE_HEIGHT) pdf.ln(LINE_HEIGHT)
# response = requests.get(series_photos[average[SERIES]]) # adds photo of most frequent series
# img = Image.open(BytesIO(response.content)) response = requests.get(series_photos[mode(data[SERIES])])
# pdf.image(img) img = Image.open(BytesIO(response.content))
pdf.output("output/pdf.pdf") pdf.image(img)
pdf.output("output/pdf/secinajumi.pdf")
def calc_mode(data): def make_dir():
mode_columns = {} if "output" not in listdir():
for column in COLUMNS: mkdir("output")
mode_columns[column] = (mode(data[column])) if "excel" not in listdir("output"):
# if column == SERIES: mkdir("output/excel")
# print(data[column]) if "graphs" not in listdir("output"):
# print(f"{column} = {mode(data[column])}") mkdir("output/graphs")
# else: if "pdf" not in listdir("output"):
# print(f"{column} = {mode(data[column])}") mkdir("output/pdf")
# mean_price_columns[column] = mode(data[PRICE]) / mode(data[column])
return mode_columns
def graph_plot(): def graph_plot():
@ -219,15 +235,24 @@ flats_aizkraukle = SS("https://www.ss.com/lv/real-estate/flats/aizkraukle-and-re
flats_tukums = SS("https://www.ss.com/lv/real-estate/flats/tukums-and-reg/sell/", "tukums") flats_tukums = SS("https://www.ss.com/lv/real-estate/flats/tukums-and-reg/sell/", "tukums")
flats_ogre = SS("https://www.ss.com/lv/real-estate/flats/ogre-and-reg/sell/", "ogre") flats_ogre = SS("https://www.ss.com/lv/real-estate/flats/ogre-and-reg/sell/", "ogre")
OPERATIONS = """
python pd_pandas_k_f_cagulis.py
python pd_pandas_k_f_cagulis.py <operations>
Operations:
-h --help
-n --new Scrape new file
"""
def main(argv): def main(argv):
for arg in argv: for arg in argv:
if arg == "-h" or arg == "--help": if arg in ["-h", "--help"]:
print(f"{__file__} -N --new Scrape new file") print(OPERATIONS)
exit() exit()
elif arg == "-n" or arg == "--new": elif arg in ["-n", "--new"]:
flats_riga.get_data() flats_riga.get_data()
# flats_ogre.get_data() make_dir()
graph_plot() graph_plot()

View File

@ -1,12 +1,12 @@
# Author - Kristiāns Francis Cagulis # Author - Kristiāns Francis Cagulis
# Date - 17.02.2022 # Date - 21.02.2022
# Title - Patstāvīgais darbs "SS.com scraping" # Title - Patstāvīgais darbs "SS.com scraping"
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
import requests import requests
import pandas as pd import pandas as pd
# import progressbar as pbar
from loadbar import LoadBar from loadbar import LoadBar
from os import mkdir, listdir
from datetime import datetime from datetime import datetime
HEADERS = { HEADERS = {
@ -16,6 +16,7 @@ HEADERS = {
class SS: class SS:
def __init__(self, url, name): def __init__(self, url, name):
self.url = url self.url = url
self.name = name self.name = name
@ -88,6 +89,8 @@ class SS:
columns = ["Atrašanās vieta", "Istabu skaits", "Kvadratūra", "Stāvs", "Sērija", "Cena", "Pilns sludinājuma teksts", "Izvietošanas datums"] columns = ["Atrašanās vieta", "Istabu skaits", "Kvadratūra", "Stāvs", "Sērija", "Cena", "Pilns sludinājuma teksts", "Izvietošanas datums"]
df = pd.DataFrame(chunked_items_list, columns=columns) df = pd.DataFrame(chunked_items_list, columns=columns)
time = datetime.now().strftime("%d%m%y%H%M%S") # current time time = datetime.now().strftime("%d%m%y%H%M%S") # current time
if "excel" not in listdir("output"):
mkdir("output/excel")
df.to_excel(excel_writer=f"output/excel/ss_{self.name}_{time}.xlsx", index=False) df.to_excel(excel_writer=f"output/excel/ss_{self.name}_{time}.xlsx", index=False)

View File

@ -1,2 +0,0 @@
val = "5.00"
print(int((float(val))))