mirror of
https://github.com/kristoferssolo/School.git
synced 2025-10-21 20:10:38 +00:00
task_180222
This commit is contained in:
parent
b3fc9f2cc3
commit
6d100905fe
Binary file not shown.
|
Before Width: | Height: | Size: 238 KiB After Width: | Height: | Size: 242 KiB |
Binary file not shown.
|
Before Width: | Height: | Size: 16 KiB After Width: | Height: | Size: 15 KiB |
Binary file not shown.
BIN
february/task_180222/output/pdf/secinajumi.pdf
Normal file
BIN
february/task_180222/output/pdf/secinajumi.pdf
Normal file
Binary file not shown.
@ -1,5 +1,5 @@
|
|||||||
# Author - Kristiāns Francis Cagulis
|
# Author - Kristiāns Francis Cagulis
|
||||||
# Date - 17.02.2022.
|
# Date - 21.02.2022.
|
||||||
# Title - Patstāvīgais darbs - pandas
|
# Title - Patstāvīgais darbs - pandas
|
||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
@ -7,10 +7,11 @@ import seaborn as sns
|
|||||||
import matplotlib.pyplot as plt
|
import matplotlib.pyplot as plt
|
||||||
import requests
|
import requests
|
||||||
import sys
|
import sys
|
||||||
|
from os import mkdir, listdir
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from random import randint
|
from random import randint
|
||||||
from fpdf import FPDF
|
from fpdf import FPDF
|
||||||
from statistics import mode
|
from statistics import mode, mean
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
from ss_scraper import SS
|
from ss_scraper import SS
|
||||||
@ -46,14 +47,16 @@ series_photos = {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
class priceGraphs:
|
class priceGraph:
|
||||||
def __init__(self, data, pos, x_value, title, xlabel, y_value=PRICE, ylabel="Price"):
|
|
||||||
|
def __init__(self, data, pos, title, x_value, xlabel, xticks=None, y_value=PRICE, ylabel="Price"):
|
||||||
self.pos = pos
|
self.pos = pos
|
||||||
self.x_value = data[x_value]
|
self.x_value = data[x_value]
|
||||||
self.y_value = data[y_value]
|
self.y_value = data[y_value]
|
||||||
self.title = title
|
self.title = title
|
||||||
self.xlabel = xlabel
|
self.xlabel = xlabel
|
||||||
self.ylabel = ylabel
|
self.ylabel = ylabel
|
||||||
|
self.xticks = xticks
|
||||||
|
|
||||||
def _graph_price(self):
|
def _graph_price(self):
|
||||||
plot = plt.subplot2grid((3, 2), self.pos)
|
plot = plt.subplot2grid((3, 2), self.pos)
|
||||||
@ -61,6 +64,8 @@ class priceGraphs:
|
|||||||
plot.set_title(self.title)
|
plot.set_title(self.title)
|
||||||
plot.set_xlabel(self.xlabel)
|
plot.set_xlabel(self.xlabel)
|
||||||
plot.set_ylabel(self.ylabel)
|
plot.set_ylabel(self.ylabel)
|
||||||
|
if self.xticks != None:
|
||||||
|
plot.set_xticks(self.xticks)
|
||||||
|
|
||||||
|
|
||||||
def read():
|
def read():
|
||||||
@ -68,9 +73,9 @@ def read():
|
|||||||
|
|
||||||
for file_path in files:
|
for file_path in files:
|
||||||
all_df.append(pd.read_excel(file_path))
|
all_df.append(pd.read_excel(file_path))
|
||||||
df_combined = pd.concat(all_df).reset_index(drop=True)
|
df_combined = pd.concat(all_df).reset_index(drop=True) # combine DataFrames
|
||||||
df_combined.sort_values(by=[PRICE, PUB_DATE], inplace=True)
|
df_combined.sort_values(by=[PRICE, PUB_DATE], inplace=True) # sort DataFrame
|
||||||
df_combined.drop_duplicates(subset="Pilns sludinājuma teksts", keep=False, inplace=True)
|
df_combined.drop_duplicates(keep=False, inplace=True) # drop duplicates
|
||||||
|
|
||||||
# converts floor value to integer
|
# converts floor value to integer
|
||||||
for value in df_combined[FLOOR]:
|
for value in df_combined[FLOOR]:
|
||||||
@ -80,18 +85,19 @@ def read():
|
|||||||
for value in df_combined[PRICE]:
|
for value in df_combined[PRICE]:
|
||||||
df_combined = df_combined.replace(value, replace_value(value, " ", ",", ""))
|
df_combined = df_combined.replace(value, replace_value(value, " ", ",", ""))
|
||||||
|
|
||||||
|
# replaces "Citi" with 7
|
||||||
for _ in df_combined[ROOM_AMOUNT]:
|
for _ in df_combined[ROOM_AMOUNT]:
|
||||||
df_combined = df_combined.replace(["citi", "Citi"], "7")
|
df_combined = df_combined.replace(["citi", "Citi"], "7")
|
||||||
try:
|
|
||||||
for value in df_combined[ROOM_AMOUNT]:
|
# converts room amount to integer
|
||||||
df_combined = df_combined.replace(value, int(value))
|
for value in df_combined[ROOM_AMOUNT]:
|
||||||
except:
|
df_combined = df_combined.replace(value, int(value))
|
||||||
pass
|
|
||||||
# converts to datetime
|
# converts to datetime
|
||||||
df_combined[PUB_DATE] = pd.to_datetime(df_combined[PUB_DATE], format="%d.%m.%Y")
|
df_combined[PUB_DATE] = pd.to_datetime(df_combined[PUB_DATE], format="%d.%m.%Y").dt.date
|
||||||
|
|
||||||
# df_combined.to_excel("output/excel/combined.xlsx", index=False)
|
# df_combined.to_excel("output/excel/combined.xlsx", index=False)
|
||||||
return df_combined.sort_values(by=[PRICE, PUB_DATE])
|
return df_combined.sort_values(by=PUB_DATE)
|
||||||
|
|
||||||
|
|
||||||
# replace value
|
# replace value
|
||||||
@ -117,11 +123,11 @@ def graph_price(data):
|
|||||||
plt.figure(figsize=(50, 30))
|
plt.figure(figsize=(50, 30))
|
||||||
plt.rc("font", size=15)
|
plt.rc("font", size=15)
|
||||||
|
|
||||||
plot1 = priceGraphs(data, (0, 0), FLOOR, "Price to floor", "Floor")
|
plot1 = priceGraph(data, (0, 0), "Price to floor", FLOOR, "Floor", range(1, max(data[FLOOR]) + 1))
|
||||||
plot2 = priceGraphs(data, (0, 1), ROOM_AMOUNT, "Price to room amount", "Room amount")
|
plot2 = priceGraph(data, (0, 1), "Price to room amount", ROOM_AMOUNT, "Room amount")
|
||||||
plot3 = priceGraphs(data, (1, 0), QUADRATURE, "Price to quadrature", "Quadrature")
|
plot3 = priceGraph(data, (1, 0), "Price to quadrature", QUADRATURE, "Quadrature")
|
||||||
plot4 = priceGraphs(data, (1, 1), SERIES, "Price to series", "Series")
|
plot4 = priceGraph(data, (1, 1), "Price to series", SERIES, "Series")
|
||||||
plot5 = priceGraphs(data, (2, 0), PUB_DATE, "Price to date", "Date")
|
plot5 = priceGraph(data, (2, 0), "Price to date", PUB_DATE, "Date")
|
||||||
|
|
||||||
plot1._graph_price()
|
plot1._graph_price()
|
||||||
plot2._graph_price()
|
plot2._graph_price()
|
||||||
@ -143,6 +149,7 @@ def create_pdf(data):
|
|||||||
height = pdf.font_size * 2
|
height = pdf.font_size * 2
|
||||||
LINE_HEIGHT = 5
|
LINE_HEIGHT = 5
|
||||||
|
|
||||||
|
# table head
|
||||||
for column in COLUMNS:
|
for column in COLUMNS:
|
||||||
if column == PUB_DATE:
|
if column == PUB_DATE:
|
||||||
col_width = width * 2
|
col_width = width * 2
|
||||||
@ -151,9 +158,9 @@ def create_pdf(data):
|
|||||||
pdf.cell(col_width, height, column, border=1)
|
pdf.cell(col_width, height, column, border=1)
|
||||||
|
|
||||||
pdf.ln(height)
|
pdf.ln(height)
|
||||||
|
# table contents
|
||||||
for _ in range(5):
|
for _ in range(5):
|
||||||
rand_num = randint(2, len(data))
|
rand_num = randint(2, len(data))
|
||||||
# print(str(data[column].iloc[rand_num])) # TODO: ERROR
|
|
||||||
for column in COLUMNS:
|
for column in COLUMNS:
|
||||||
if column == PUB_DATE:
|
if column == PUB_DATE:
|
||||||
col_width = width * 2
|
col_width = width * 2
|
||||||
@ -162,48 +169,57 @@ def create_pdf(data):
|
|||||||
pdf.cell(col_width, height, str(data[column].iloc[rand_num]), border=1)
|
pdf.cell(col_width, height, str(data[column].iloc[rand_num]), border=1)
|
||||||
pdf.ln(height)
|
pdf.ln(height)
|
||||||
|
|
||||||
text = """
|
|
||||||
"Price to floor" grafiks - lielākā daļa pārdodamo dzīvokļu ir līdz 5. stāvam.
|
|
||||||
"Price to room amount" grafiks - jo mazāk istabu, jo lētāks dzīvoklis.
|
|
||||||
"Price to quadrature" grafiks - jo lielāka dzīvokļa platība, jo dārgāks dzīvoklis.
|
|
||||||
"Price to series" grafiks - dārgākie dzīvokļi ir jaunie.
|
|
||||||
"Price to date" grafiks - nesen pārdošanā ielikto dzīvokļu ir vairāk.
|
|
||||||
"""
|
|
||||||
pdf.ln(height)
|
pdf.ln(height)
|
||||||
pdf.image(f"{output_path}/korelacija.png", w=usable_w)
|
pdf.image(f"{output_path}/korelacija.png", w=usable_w) # corr graph
|
||||||
# pdf.write(LINE_HEIGHT, "Starp istabu skaitu un cenu, kvadratūru un cenu ir liela korelācija.")
|
pdf.write(LINE_HEIGHT, "Starp istabu skaitu un cenu, kvadratūru un cenu ir liela korelācija.")
|
||||||
pdf.image(f"{output_path}/cenu_grafiki.png", w=usable_w)
|
pdf.ln(height)
|
||||||
|
pdf.image(f"{output_path}/cenu_grafiki.png", w=usable_w) # price graph
|
||||||
|
|
||||||
|
# price graph conclusions
|
||||||
|
text = """
|
||||||
|
"Price to floor" grafiks - lielākā daļa pārdodamo dzīvokļu ir līdz 6. stāvam.
|
||||||
|
"Price to room amount" grafiks - veido normālo sadalījumu (Gausa sadalījumu).
|
||||||
|
"Price to quadrature" grafiks - jo lielāka dzīvokļa platība, jo dārgāks dzīvoklis.
|
||||||
|
"Price to series" grafiks - jaunie, renovētie un pēc kara dzīvokļi ir dārgāki.
|
||||||
|
"Price to date" grafiks - nav nekādas sakarības.
|
||||||
|
"""
|
||||||
for txt in text.split("\n"):
|
for txt in text.split("\n"):
|
||||||
pdf.write(LINE_HEIGHT, txt.strip())
|
pdf.write(LINE_HEIGHT, txt.strip())
|
||||||
pdf.ln(LINE_HEIGHT)
|
pdf.ln(LINE_HEIGHT)
|
||||||
|
|
||||||
average = calc_mode(data)
|
# mean/mode values
|
||||||
# print(average)
|
text = [
|
||||||
for key, value in average.items():
|
"Vidējā cena: ", "Vidējā cena attiecībā pret kvadratūru: ", "Sērijas moda: ", "Vidējā cena attiecībā pret istabu skaitu: ",
|
||||||
print(f"{key} - {value}")
|
"Vidējā cena attiecībā pret stāvu: "
|
||||||
# if not isinstance(value, str):
|
]
|
||||||
# value = str(round(value))
|
values = [
|
||||||
pdf.write(LINE_HEIGHT, f"{key} - {value}")
|
round(mean(data[PRICE]), 2),
|
||||||
|
round(mean(data[PRICE]) / mean(data[QUADRATURE])),
|
||||||
|
mode(data[SERIES]),
|
||||||
|
round(mean(data[PRICE]) / mean(data[ROOM_AMOUNT])),
|
||||||
|
round(mean(data[PRICE]) / mean(data[FLOOR]))
|
||||||
|
]
|
||||||
|
for txt, value in zip(text, values):
|
||||||
|
pdf.write(LINE_HEIGHT, f"{txt}{value}")
|
||||||
pdf.ln(LINE_HEIGHT)
|
pdf.ln(LINE_HEIGHT)
|
||||||
|
|
||||||
# response = requests.get(series_photos[average[SERIES]])
|
# adds photo of most frequent series
|
||||||
# img = Image.open(BytesIO(response.content))
|
response = requests.get(series_photos[mode(data[SERIES])])
|
||||||
# pdf.image(img)
|
img = Image.open(BytesIO(response.content))
|
||||||
pdf.output("output/pdf.pdf")
|
pdf.image(img)
|
||||||
|
|
||||||
|
pdf.output("output/pdf/secinajumi.pdf")
|
||||||
|
|
||||||
|
|
||||||
def calc_mode(data):
|
def make_dir():
|
||||||
mode_columns = {}
|
if "output" not in listdir():
|
||||||
for column in COLUMNS:
|
mkdir("output")
|
||||||
mode_columns[column] = (mode(data[column]))
|
if "excel" not in listdir("output"):
|
||||||
# if column == SERIES:
|
mkdir("output/excel")
|
||||||
# print(data[column])
|
if "graphs" not in listdir("output"):
|
||||||
# print(f"{column} = {mode(data[column])}")
|
mkdir("output/graphs")
|
||||||
# else:
|
if "pdf" not in listdir("output"):
|
||||||
# print(f"{column} = {mode(data[column])}")
|
mkdir("output/pdf")
|
||||||
# mean_price_columns[column] = mode(data[PRICE]) / mode(data[column])
|
|
||||||
return mode_columns
|
|
||||||
|
|
||||||
|
|
||||||
def graph_plot():
|
def graph_plot():
|
||||||
@ -219,15 +235,24 @@ flats_aizkraukle = SS("https://www.ss.com/lv/real-estate/flats/aizkraukle-and-re
|
|||||||
flats_tukums = SS("https://www.ss.com/lv/real-estate/flats/tukums-and-reg/sell/", "tukums")
|
flats_tukums = SS("https://www.ss.com/lv/real-estate/flats/tukums-and-reg/sell/", "tukums")
|
||||||
flats_ogre = SS("https://www.ss.com/lv/real-estate/flats/ogre-and-reg/sell/", "ogre")
|
flats_ogre = SS("https://www.ss.com/lv/real-estate/flats/ogre-and-reg/sell/", "ogre")
|
||||||
|
|
||||||
|
OPERATIONS = """
|
||||||
|
python pd_pandas_k_f_cagulis.py
|
||||||
|
python pd_pandas_k_f_cagulis.py <operations>
|
||||||
|
|
||||||
|
Operations:
|
||||||
|
-h --help
|
||||||
|
-n --new Scrape new file
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
def main(argv):
|
def main(argv):
|
||||||
for arg in argv:
|
for arg in argv:
|
||||||
if arg == "-h" or arg == "--help":
|
if arg in ["-h", "--help"]:
|
||||||
print(f"{__file__} -N --new Scrape new file")
|
print(OPERATIONS)
|
||||||
exit()
|
exit()
|
||||||
elif arg == "-n" or arg == "--new":
|
elif arg in ["-n", "--new"]:
|
||||||
flats_riga.get_data()
|
flats_riga.get_data()
|
||||||
# flats_ogre.get_data()
|
make_dir()
|
||||||
graph_plot()
|
graph_plot()
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -1,12 +1,12 @@
|
|||||||
# Author - Kristiāns Francis Cagulis
|
# Author - Kristiāns Francis Cagulis
|
||||||
# Date - 17.02.2022
|
# Date - 21.02.2022
|
||||||
# Title - Patstāvīgais darbs "SS.com scraping"
|
# Title - Patstāvīgais darbs "SS.com scraping"
|
||||||
|
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
import requests
|
import requests
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
# import progressbar as pbar
|
|
||||||
from loadbar import LoadBar
|
from loadbar import LoadBar
|
||||||
|
from os import mkdir, listdir
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
|
||||||
HEADERS = {
|
HEADERS = {
|
||||||
@ -16,6 +16,7 @@ HEADERS = {
|
|||||||
|
|
||||||
|
|
||||||
class SS:
|
class SS:
|
||||||
|
|
||||||
def __init__(self, url, name):
|
def __init__(self, url, name):
|
||||||
self.url = url
|
self.url = url
|
||||||
self.name = name
|
self.name = name
|
||||||
@ -88,6 +89,8 @@ class SS:
|
|||||||
columns = ["Atrašanās vieta", "Istabu skaits", "Kvadratūra", "Stāvs", "Sērija", "Cena", "Pilns sludinājuma teksts", "Izvietošanas datums"]
|
columns = ["Atrašanās vieta", "Istabu skaits", "Kvadratūra", "Stāvs", "Sērija", "Cena", "Pilns sludinājuma teksts", "Izvietošanas datums"]
|
||||||
df = pd.DataFrame(chunked_items_list, columns=columns)
|
df = pd.DataFrame(chunked_items_list, columns=columns)
|
||||||
time = datetime.now().strftime("%d%m%y%H%M%S") # current time
|
time = datetime.now().strftime("%d%m%y%H%M%S") # current time
|
||||||
|
if "excel" not in listdir("output"):
|
||||||
|
mkdir("output/excel")
|
||||||
df.to_excel(excel_writer=f"output/excel/ss_{self.name}_{time}.xlsx", index=False)
|
df.to_excel(excel_writer=f"output/excel/ss_{self.name}_{time}.xlsx", index=False)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -1,2 +0,0 @@
|
|||||||
val = "5.00"
|
|
||||||
print(int((float(val))))
|
|
||||||
Loading…
Reference in New Issue
Block a user