commit 749aa131a6ff12e25521425d0cd04b49644f1aaf
Author: Kristofers Solo
Date:   Sun Sep 11 19:26:23 2022 +0300

    Initial commit

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..626ab9a
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,116 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# celery beat schedule file
+celerybeat-schedule
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+config.json

diff --git a/main.py b/main.py
new file mode 100644
index 0000000..75fa414
--- /dev/null
+++ b/main.py
@@ -0,0 +1,42 @@
+"""
+Telegram bot for scraper
+Author - Kristofers Solo
+Licence - MIT
+"""
+import json
+import logging
+from pathlib import Path
+from aiogram import Bot, Dispatcher, executor, types
+from scraper import gpus
+
+BASE_DIR = Path(__file__).resolve().parent
+
+# Read the bot token from config.json
+with open(Path(BASE_DIR, "config.json"), "r", encoding="UTF-8") as config_file:
+    config = json.load(config_file)
+
+API_TOKEN = config["API_TOKEN"]
+
+# Configure logging
+logging.basicConfig(level=logging.INFO)
+
+# Initialize bot and dispatcher
+bot = Bot(token=API_TOKEN)
+dp = Dispatcher(bot)
+
+
+@dp.message_handler(commands=["gpu", "gpus"])
+async def gpu_price_message(message: types.Message):
+    """Sends all scraped GPUs and their prices to Telegram"""
+    data = gpus.get_data()
+    message_size = 100  # Telegram limits message length, so send in chunks
+    chunked_data = [data[i:i + message_size]
+                    for i in range(0, len(data), message_size)]
+
+    for chunk in chunked_data:
+        await message.answer("\n".join(chunk))
+    await message.answer(f"In total {len(data)} GPUs")
+
+
+if __name__ == "__main__":
+    executor.start_polling(dp, skip_updates=True)

diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..7849798
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,21 @@
+aiogram==2.22.1
+aiohttp==3.8.1
+aiosignal==1.2.0
+async-timeout==4.0.2
+attrs==22.1.0
+Babel==2.9.1
+beautifulsoup4==4.11.1
+bs4==0.0.1
+certifi==2022.6.15.1
+charset-normalizer==2.1.1
+frozenlist==1.3.1
+idna==3.3
+multidict==6.0.2
+numpy==1.23.2
+python-dateutil==2.8.2
+pytz==2022.2.1
+requests==2.28.1
+six==1.16.0
+soupsieve==2.3.2.post1
+urllib3==1.26.12
+yarl==1.8.1
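Note: main.py above expects a config.json next to it, which is deliberately listed in the .gitignore so the token is never committed. The repo ships no example, so the shape below is a minimal sketch inferred from the config["API_TOKEN"] lookup in main.py; the token value is a placeholder, not a real credential:

    {
        "API_TOKEN": "123456789:replace-with-your-telegram-bot-token"
    }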
diff --git a/scraper.py b/scraper.py
new file mode 100644
index 0000000..f3cd1ae
--- /dev/null
+++ b/scraper.py
@@ -0,0 +1,97 @@
+"""
+Web scraper of the SS marketplace for GPUs
+Author - Kristofers Solo
+Licence - MIT
+"""
+
+from bs4 import BeautifulSoup
+import requests
+
+HEADERS = {
+    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.97 Safari/537.36 Vivaldi/4.1.2369.21"}
+
+
+class SS:
+    def __init__(self, url):
+        self.url = url
+
+    def _get_page_amount(self) -> int:
+        """Reads the pagination link on the first page to find the page count"""
+        page = requests.get(self.url, headers=HEADERS)
+        soup = BeautifulSoup(page.content, "html.parser")
+
+        # the last pagination link looks like ".../pageN.html"
+        last_url = soup.find(class_="td2").findChild("a")["href"]
+        page_amount = last_url[last_url.find(
+            "page") + 4:last_url.find(".html")]
+        print(f"Page amount = {page_amount}")
+
+        return int(page_amount)
+
+    def get_data(self) -> list:
+        """Runs the scraper"""
+        gpus_list = []
+        for page_number in range(1, self._get_page_amount() + 1):
+            url = self.url + f"/page{page_number}.html"
+            page = requests.get(url, headers=HEADERS)
+            soup = BeautifulSoup(page.content, "html.parser")
+
+            # item ids
+            ids = [tag["id"]
+                   for tag in soup.select("tr[id]")]  # creates list with ids
+            # removes banner ("tr_bnr") rows from the list
+            ids = [x for x in ids if "tr_bnr" not in x]
+            ids.remove("head_line")  # removes the table header row id
+
+            # getting item data
+            for item_no, elem in enumerate(soup.find_all(id=ids)):
+                print(f"Item {item_no + 1} on page {page_number}")
+
+                # advert table cells (title, price and other columns)
+                item = elem.find_all(class_="msga2-o pp6")
+                gpu = []
+                for text in item:
+                    gpu.append(text.get_text())
+                # keep only the title and price columns
+                for _ in range(3):
+                    gpu.pop(1)
+
+                # removes thousands separators (commas) from prices
+                gpu[1] = gpu[1].replace(",", "")
+
+                # removes excessive symbols from price (€ and white-spaces)
+                gpu[1] = gpu[1][:-3]
+
+                # converts prices to float so they sort numerically
+                gpu[1] = float(gpu[1])
+
+                gpus_list.append(gpu)
+
+        gpus_list = sorted(gpus_list, key=lambda x: x[1])
+
+        for index, gpu in enumerate(gpus_list):
+            # convert price back to string and add `€`
+            gpu[1] = str(gpu[1]) + " €"
+            # transform 2D array to 1D
+            gpus_list[index] = " - ".join(gpu)
+
+        return gpus_list
+
+
+gpus = SS("https://www.ss.com/lv/electronics/computers/completing-pc/video/sell")
+
+
+def main():
+    """Main function to test the scraper"""
+    data = gpus.get_data()
+
+    # print listings in chunks, mirroring the bot's chunked messages
+    message_size = 100
+    chunked_data = [data[i:i + message_size]
+                    for i in range(0, len(data), message_size)]
+    for chunk in chunked_data:
+        print("\n".join(chunk))


+if __name__ == "__main__":
+    main()
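Note: to try this commit end to end (assuming a reasonably recent Python 3; the repo doesn't pin an interpreter version):

    pip install -r requirements.txt
    # create config.json with your bot token (see the sketch above)
    python scraper.py   # run the scraper standalone; prints sorted listings to stdout
    python main.py      # start the Telegram bot, then send it /gpu or /gpus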