diff --git a/february/task_180222/output/output.xlsx b/february/task_180222/output/output.xlsx
new file mode 100644
index 00000000..f77cd67c
Binary files /dev/null and b/february/task_180222/output/output.xlsx differ
diff --git a/february/task_180222/pd_pandas_k_f_cagulis.py b/february/task_180222/pd_pandas_k_f_cagulis.py
new file mode 100644
index 00000000..fc6e1b1a
--- /dev/null
+++ b/february/task_180222/pd_pandas_k_f_cagulis.py
@@ -0,0 +1,20 @@
+# Author - Kristiāns Francis Cagulis
+# Date - 16.02.2022.
+# Title - Independent work - pandas
+
+from pathlib import Path as p
+from ss_scraper import SS
+
+# flats_few = SS("https://www.ss.com/lv/real-estate/flats/riga-region/all/sell/")
+# flats_few.get_data()
+
+
+def read():
+    pass
+
+
+def address():
+    pass
+
+
+print(list(p(__file__).parent.absolute().glob("*/*.xlsx")))  # lists the scraped .xlsx workbooks
\ No newline at end of file
diff --git a/february/task_180222/ss_scraper.py b/february/task_180222/ss_scraper.py
new file mode 100644
index 00000000..7a830926
--- /dev/null
+++ b/february/task_180222/ss_scraper.py
@@ -0,0 +1,86 @@
+# Author - Kristiāns Francis Cagulis
+# Date - 07.12.2021
+# Title - Independent work "SS.com scraping"
+
+from bs4 import BeautifulSoup
+import requests
+import pandas as pd
+
+HEADERS = {
+    "User-Agent":
+        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.97 Safari/537.36 Vivaldi/4.1.2369.21'
+}
+
+
+class SS:
+
+    def __init__(self, url):
+        self.url = url
+
+    def _get_page_amount(self):
+        page = requests.get(self.url, headers=HEADERS)
+        soup = BeautifulSoup(page.content, 'html.parser')
+
+        last_url = soup.find(class_='td2').findChild('a')['href']  # link to the last results page
+        page_amount = last_url[last_url.find("page") + 4:last_url.find(".html")]
+        print(f"Page amount = {page_amount}")
+
+        return int(page_amount)
+
+    def get_data(self):
+        items = []
+        item_no = 1
+        for page_number in range(1, self._get_page_amount() + 1):
+            url = f"{self.url}page{page_number}.html"  # self.url already ends with "/"
+
+            page = requests.get(url, headers=HEADERS)
+            soup = BeautifulSoup(page.content, 'html.parser')
+
+            # item ids
+            ids = [tag['id'] for tag in soup.select('tr[id]')]  # collects the ids of all listing rows
+            ids = [x for x in ids if "tr_bnr" not in x]  # drops banner rows ("tr_bnr")
+            ids.remove("head_line")  # drops the table header row id
+            print(f"Page {page_number}")
+
+            # getting item data
+            for el in soup.find_all(id=ids):
+                print(f"Item {item_no}")
+                item_no += 1
+
+                for elem in el.find_all(class_='msga2-o pp6'):
+                    items.append(elem.get_text())
+
+                # advert URL
+                item_url = el.findChild(class_='msg2').findChild('div').findChild('a')['href']  # relative advert link
+                item_url = "https://www.ss.com" + item_url
+                item_page = requests.get(item_url, headers=HEADERS)
+                item_soup = BeautifulSoup(item_page.content, 'html.parser')
+
+                # advert's full text
+                item_text = item_soup.find(id='msg_div_msg').get_text()  # full advert text
+                item_text = item_text[:item_text.find("Pilsēta:")]  # strips the trailing details table
+                items.append(item_text)
+
+                # advert's publication date
+                item_date = item_soup.find_all('td', class_='msg_footer')  # all 'msg_footer' cells
+                item_date = item_date[2].get_text()  # extracts the 3rd cell
+                items.append(item_date[8:18])  # keeps only the date part
+
+        chunk_size = 8
+        chunked_items_list = [items[i:i + chunk_size] for i in range(0, len(items), chunk_size)]  # groups every 'chunk_size' values into one row
+        columns = ["Atrašanās vieta", "Istabu skaits", "Kvadratūra", "Stāvs", "Sērija", "Cena", "Pilns sludinājuma teksts", "Izvietošanas datums"]  # (EN): location, room count, floor area, floor, series, price, full advert text, listing date
+        df = pd.DataFrame(chunked_items_list, columns=columns)
+        df.to_excel(excel_writer='output/output.xlsx', index=False)
+        print("Done")
+
+
+flats_many = SS("https://www.ss.com/lv/real-estate/flats/riga/all/sell/")
+flats_few = SS("https://www.ss.com/lv/real-estate/flats/riga-region/all/sell/")
+
+
+def main():
+    flats_few.get_data()
+
+
+if __name__ == '__main__':
+    main()
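
The read() and address() helpers in pd_pandas_k_f_cagulis.py are committed as stubs. Below is a minimal sketch of one way to fill them in with pandas; it assumes the output/*.xlsx layout that SS.get_data writes and reuses the same "*/*.xlsx" glob as the print(...) line already in the file. The column lookup relies on the Latvian header "Atrašanās vieta" (location) from ss_scraper.py; nothing here is part of the committed code.

    from pathlib import Path

    import pandas as pd


    def read():
        # Load the first workbook the scraper produced (same glob pattern
        # as the existing print(...) line in pd_pandas_k_f_cagulis.py).
        workbook = next(Path(__file__).parent.absolute().glob("*/*.xlsx"))
        return pd.read_excel(workbook)


    def address():
        # "Atrašanās vieta" (location) is one of the column headers that
        # SS.get_data writes to output/output.xlsx.
        return read()["Atrašanās vieta"]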