mirror of
https://github.com/kristoferssolo/School.git
synced 2025-10-21 20:10:38 +00:00
task_180222
This commit is contained in:
parent
0bba2c52a2
commit
1a085b2a84
BIN
february/task_180222/output/output.xlsx
Normal file
BIN
february/task_180222/output/output.xlsx
Normal file
Binary file not shown.
20
february/task_180222/pd_pandas_k_f_cagulis.py
Normal file
20
february/task_180222/pd_pandas_k_f_cagulis.py
Normal file
@ -0,0 +1,20 @@
|
||||
# Author - Kristiāns Francis Cagulis
|
||||
# Date - 16.02.2022.
|
||||
# Title - Patstāvīgais darbs - pandas
|
||||
|
||||
from pathlib import Path as p
|
||||
from ss_scraper import SS
|
||||
|
||||
# flats_few = SS("https://www.ss.com/lv/real-estate/flats/riga-region/all/sell/")
|
||||
# flats_few.get_data()
|
||||
|
||||
|
||||
def read():
    # Stub — presumably meant to load the generated output/output.xlsx
    # (see the glob/print below); TODO(review): confirm and implement.
    pass
|
||||
|
||||
|
||||
def address():
    # Stub — presumably intended to process the "Atrašanās vieta"
    # (address) column of the scraped data; TODO(review): implement.
    pass
|
||||
|
||||
|
||||
# Debug aid: list every .xlsx file one directory level below this script
# (e.g. output/output.xlsx produced by ss_scraper).
# NOTE(review): the original wrapped the result in p() a second time —
# `.parent.absolute()` already returns a Path, so the extra constructor
# was a no-op and is removed here.
print(list(p(__file__).parent.absolute().glob("*/*.xlsx")))
|
||||
86
february/task_180222/ss_scraper.py
Normal file
86
february/task_180222/ss_scraper.py
Normal file
@ -0,0 +1,86 @@
|
||||
# Author - Kristiāns Francis Cagulis
|
||||
# Date - 07.12.2021
|
||||
# Title - Patstāvīgais darbs "SS.com scraping"
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
import requests
|
||||
import pandas as pd
|
||||
|
||||
# Browser-like User-Agent sent with every request so ss.com serves the
# normal HTML pages instead of rejecting the script as a bot.
HEADERS = {
    "User-Agent":
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.97 Safari/537.36 Vivaldi/4.1.2369.21'
}
|
||||
|
||||
|
||||
class SS:
    """Scraper for an ss.com real-estate listing.

    Walks every result page of the listing ``url`` given to the
    constructor, collects each advert's table fields, full advert text
    and publication date, and writes everything to
    ``output/output.xlsx``.
    """

    def __init__(self, url):
        # url: listing base URL, e.g.
        # "https://www.ss.com/lv/real-estate/flats/riga/all/sell/"
        self.url = url

    def _get_page_amount(self):
        """Return the total number of result pages for the listing.

        The first page's pagination block links to the last page as
        ``page<N>.html``; N is extracted from that href.
        """
        # timeout: without it requests can block forever on a stalled server.
        page = requests.get(self.url, headers=HEADERS, timeout=30)
        soup = BeautifulSoup(page.content, 'html.parser')

        # First 'td2' cell holds the pager; its first <a> points at the last page.
        last_url = soup.find(class_='td2').findChild('a')['href']
        page_amount = last_url[last_url.find("page") + 4:last_url.find(".html")]
        print(f"Page amount = {page_amount}")

        return int(page_amount)

    def get_data(self):
        """Scrape every page of the listing and export it to Excel.

        For each advert row the six listing-table cells, the full advert
        text and the publication date are appended to one flat list,
        which is then chunked into 8-column records and written to
        ``output/output.xlsx``.
        """
        items = []
        item_no = 1
        for page_number in range(1, self._get_page_amount() + 1):
            # rstrip avoids a double slash: the listing URLs end in "/".
            url = self.url.rstrip("/") + f"/page{page_number}.html"

            page = requests.get(url, headers=HEADERS, timeout=30)
            soup = BeautifulSoup(page.content, 'html.parser')

            # Advert row ids: every <tr> with an id, minus banner rows.
            ids = [tag['id'] for tag in soup.select('tr[id]')]
            ids = [x for x in ids if "tr_bnr" not in x]
            # Guard: bare remove() would raise ValueError if the header
            # row were ever missing from a page.
            if "head_line" in ids:
                ids.remove("head_line")
            print(f"Page {page_number}")

            # Per-advert data.
            for el in soup.find_all(id=ids):
                print(f"Item {item_no}")
                item_no += 1

                # The six visible table cells (location, rooms, m², floor,
                # series, price).
                for elem in el.find_all(class_='msga2-o pp6'):
                    items.append(elem.get_text())

                # Follow the advert's own page for the remaining fields.
                item_url = el.findChild(class_='msg2').findChild('div').findChild('a')['href']
                item_url = "https://www.ss.com" + item_url
                item_page = requests.get(item_url, headers=HEADERS, timeout=30)
                item_soup = BeautifulSoup(item_page.content, 'html.parser')

                # Full advert text, cropped before the trailing details
                # table (which starts at "Pilsēta:" / "City:").
                item_text = item_soup.find(id='msg_div_msg').get_text()
                item_text = item_text[:item_text.find("Pilsēta:")]
                items.append(item_text)

                # Publication date: 3rd 'msg_footer' cell; [8:18] crops
                # the surrounding label text down to the date itself.
                item_date = item_soup.find_all('td', class_='msg_footer')
                item_date = item_date[2].get_text()
                items.append(item_date[8:18])

        # 8 fields per advert -> split the flat list into 8-column rows.
        chunk_size = 8
        chunked_items_list = [items[i:i + chunk_size] for i in range(0, len(items), chunk_size)]
        columns = ["Atrašanās vieta", "Istabu skaits", "Kvadratūra", "Stāvs", "Sērija", "Cena", "Pilns sludinājuma teksts", "Izvietošanas datums"]
        df = pd.DataFrame(chunked_items_list, columns=columns)
        df.to_excel(excel_writer='output/output.xlsx', index=False)
        print("Done")
|
||||
|
||||
|
||||
# Pre-configured scrapers: flats for sale in Riga city and in the
# surrounding Riga region.
# NOTE(review): flats_many is defined but never used by main().
flats_many = SS("https://www.ss.com/lv/real-estate/flats/riga/all/sell/")
flats_few = SS("https://www.ss.com/lv/real-estate/flats/riga-region/all/sell/")
|
||||
|
||||
|
||||
def main():
    """Script entry point: scrape the Riga-region flats listing to Excel."""
    flats_few.get_data()


if __name__ == '__main__':
    main()
|
||||
Loading…
Reference in New Issue
Block a user