Initial commit

Kristofers Solo 2022-09-11 19:26:23 +03:00
commit 749aa131a6
4 changed files with 272 additions and 0 deletions

.gitignore (vendored): new file, 116 lines added

@@ -0,0 +1,116 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
.python-version
# celery beat schedule file
celerybeat-schedule
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
config.json

main.py: new file, 42 lines added

@@ -0,0 +1,42 @@
"""
Telegram bot for scraper
Author - Kristofers Solo
Licence - MIT
"""
import json
import logging
from pathlib import Path
from aiogram import Bot, Dispatcher, executor, types
from scraper import gpus
BASE_DIR = Path(__file__).resolve().parent
# Read the token from file
with open(Path(BASE_DIR, "config.json"), "r", encoding="UTF-8") as config_file:
config = json.load(config_file)
API_TOKEN = config["API_TOKEN"]
# Configure logging
logging.basicConfig(level=logging.INFO)
# Initialize bot and dispatcher
bot = Bot(token=API_TOKEN)
dp = Dispatcher(bot)
@ dp.message_handler(commands=["gpu", "gpus"])
async def gpu_price_message(message: types.Message):
"""Returns all scraped GPUs and their prices to telegram"""
data = gpus.get_data()
message_size = 100
chunked_data = [data[i:i + message_size]
for i in range(0, len(data), message_size)]
for i in chunked_data:
await message.answer("\n".join(i))
await message.answer(f"In total {len(data)} GPUs")
if __name__ == "__main__":
executor.start_polling(dp, skip_updates=True)
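
The bot reads its token from config.json, which .gitignore keeps out of the repository, so a local config.json has to sit next to main.py. A minimal sketch of that file, assuming API_TOKEN is the only key the code needs; the value is a placeholder:

{
    "API_TOKEN": "<your-telegram-bot-token>"
}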

requirements.txt: new file, 21 lines added

@@ -0,0 +1,21 @@
aiogram==2.22.1
aiohttp==3.8.1
aiosignal==1.2.0
async-timeout==4.0.2
attrs==22.1.0
Babel==2.9.1
beautifulsoup4==4.11.1
bs4==0.0.1
certifi==2022.6.15.1
charset-normalizer==2.1.1
frozenlist==1.3.1
idna==3.3
multidict==6.0.2
numpy==1.23.2
python-dateutil==2.8.2
pytz==2022.2.1
requests==2.28.1
six==1.16.0
soupsieve==2.3.2.post1
urllib3==1.26.12
yarl==1.8.1

scraper.py: new file, 93 lines added

@@ -0,0 +1,93 @@
"""
Webscaper of SS marketplace for GPUs
Author - Kristofers Solo
Licence - MIT
"""
from bs4 import BeautifulSoup
import requests
HEADERS = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.97 Safari/537.36 Vivaldi/4.1.2369.21"}
class SS:
def __init__(self, url):
self.url = url
def _get_page_amount(self) -> int:
page = requests.get(self.url, headers=HEADERS)
soup = BeautifulSoup(page.content, "html.parser")
last_url = soup.find(class_="td2").findChild("a")["href"]
page_amount = last_url[last_url.find(
"page") + 4:last_url.find(".html")]
print(f"Page amount = {page_amount}")
return int(page_amount)
def get_data(self) -> list:
"""Runs the scraper"""
gpus_list = []
for page_number in range(1, self._get_page_amount() + 1):
url = self.url + f"/page{page_number}.html"
page = requests.get(url, headers=HEADERS)
soup = BeautifulSoup(page.content, "html.parser")
# item ids
ids = [tag["id"]
for tag in soup.select("tr[id]")] # creates list with ids
# removes "tr_bnr" elements from list
ids = [x for x in ids if "tr_bnr" not in x]
ids.remove("head_line") # removes first "head_line" id
# getting item data
for item_no, elem in enumerate(soup.find_all(id=ids)):
print(f"Item {item_no + 1} on page {page_number}")
# adverts url
item = elem.find_all(class_="msga2-o pp6") # gets url
gpu = []
for text in item:
gpu.append(text.get_text())
for _ in range(3):
gpu.pop(1)
# removes commas from prices
gpu[1] = gpu[1].replace(",", "")
# removes excessive symbols from price (€ and white-spaces)
gpu[1] = gpu[1][:-3]
# converts prices to float
gpu[1] = float(gpu[1])
gpus_list.append(gpu)
gpus_list = sorted(gpus_list, key=lambda x: x[1])
for index, gpu in enumerate(gpus_list):
# convert price back to string and add `€`
gpu[1] = str(gpu[1]) + ""
# transform 2D array to 1D
gpus_list[index] = (" - ".join(gpu))
return gpus_list
gpus = SS("https://www.ss.com/lv/electronics/computers/completing-pc/video/sell")
def main():
"""Main funcion to test scraper"""
data = gpus.get_data()
message_size = 100
chunked_data = [data[i:i + message_size]
for i in range(0, len(data), message_size)]
for i in chunked_data:
print("\n".join(i))
if __name__ == "__main__":
main()
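
Since SS is parameterised only by the listing URL, the same class can in principle be pointed at other ss.com categories, as long as they share the GPU listing's table layout. A minimal sketch; the processors URL below is an assumption for illustration, not something this commit uses:

# Hypothetical reuse of the SS class for another ss.com category.
# The URL is an assumption; the page must share the GPU listing's table layout.
from scraper import SS

cpus = SS("https://www.ss.com/lv/electronics/computers/completing-pc/processors/sell")
print("\n".join(cpus.get_data()[:10]))  # ten cheapest listings as formatted strings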