Initial commit

Kristofers Solo 2022-09-11 19:26:23 +03:00
commit 749aa131a6
4 changed files with 272 additions and 0 deletions

.gitignore (vendored): new file, 116 lines added

@@ -0,0 +1,116 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
.python-version
# celery beat schedule file
celerybeat-schedule
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
config.json

main.py: new file, 42 lines added

@@ -0,0 +1,42 @@
"""
Telegram bot for scraper
Author - Kristofers Solo
Licence - MIT
"""
import json
import logging
from pathlib import Path
from aiogram import Bot, Dispatcher, executor, types
from scraper import gpus
BASE_DIR = Path(__file__).resolve().parent
# Read the token from file
with open(Path(BASE_DIR, "config.json"), "r", encoding="UTF-8") as config_file:
config = json.load(config_file)
API_TOKEN = config["API_TOKEN"]
# Configure logging
logging.basicConfig(level=logging.INFO)
# Initialize bot and dispatcher
bot = Bot(token=API_TOKEN)
dp = Dispatcher(bot)
@ dp.message_handler(commands=["gpu", "gpus"])
async def gpu_price_message(message: types.Message):
"""Returns all scraped GPUs and their prices to telegram"""
data = gpus.get_data()
message_size = 100
chunked_data = [data[i:i + message_size]
for i in range(0, len(data), message_size)]
for i in chunked_data:
await message.answer("\n".join(i))
await message.answer(f"In total {len(data)} GPUs")
if __name__ == "__main__":
executor.start_polling(dp, skip_updates=True)
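
The bot reads its token from config.json, which .gitignore keeps out of the repository, so a local config.json has to sit next to main.py. A minimal sketch of that file, assuming API_TOKEN is the only key the code needs; the value is a placeholder:

{
    "API_TOKEN": "<your-telegram-bot-token>"
}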

requirements.txt: new file, 21 lines added

@@ -0,0 +1,21 @@
aiogram==2.22.1
aiohttp==3.8.1
aiosignal==1.2.0
async-timeout==4.0.2
attrs==22.1.0
Babel==2.9.1
beautifulsoup4==4.11.1
bs4==0.0.1
certifi==2022.6.15.1
charset-normalizer==2.1.1
frozenlist==1.3.1
idna==3.3
multidict==6.0.2
numpy==1.23.2
python-dateutil==2.8.2
pytz==2022.2.1
requests==2.28.1
six==1.16.0
soupsieve==2.3.2.post1
urllib3==1.26.12
yarl==1.8.1

scraper.py: new file, 93 lines added

@@ -0,0 +1,93 @@
"""
Webscaper of SS marketplace for GPUs
Author - Kristofers Solo
Licence - MIT
"""
from bs4 import BeautifulSoup
import requests
HEADERS = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.97 Safari/537.36 Vivaldi/4.1.2369.21"}
class SS:
def __init__(self, url):
self.url = url
def _get_page_amount(self) -> int:
page = requests.get(self.url, headers=HEADERS)
soup = BeautifulSoup(page.content, "html.parser")
last_url = soup.find(class_="td2").findChild("a")["href"]
page_amount = last_url[last_url.find(
"page") + 4:last_url.find(".html")]
print(f"Page amount = {page_amount}")
return int(page_amount)
def get_data(self) -> list:
"""Runs the scraper"""
gpus_list = []
for page_number in range(1, self._get_page_amount() + 1):
url = self.url + f"/page{page_number}.html"
page = requests.get(url, headers=HEADERS)
soup = BeautifulSoup(page.content, "html.parser")
# item ids
ids = [tag["id"]
for tag in soup.select("tr[id]")] # creates list with ids
# removes "tr_bnr" elements from list
ids = [x for x in ids if "tr_bnr" not in x]
ids.remove("head_line") # removes first "head_line" id
# getting item data
for item_no, elem in enumerate(soup.find_all(id=ids)):
print(f"Item {item_no + 1} on page {page_number}")
# adverts url
item = elem.find_all(class_="msga2-o pp6") # gets url
gpu = []
for text in item:
gpu.append(text.get_text())
for _ in range(3):
gpu.pop(1)
# removes commas from prices
gpu[1] = gpu[1].replace(",", "")
# removes excessive symbols from price (€ and white-spaces)
gpu[1] = gpu[1][:-3]
# converts prices to float
gpu[1] = float(gpu[1])
gpus_list.append(gpu)
gpus_list = sorted(gpus_list, key=lambda x: x[1])
for index, gpu in enumerate(gpus_list):
# convert price back to string and add `€`
gpu[1] = str(gpu[1]) + ""
# transform 2D array to 1D
gpus_list[index] = (" - ".join(gpu))
return gpus_list
gpus = SS("https://www.ss.com/lv/electronics/computers/completing-pc/video/sell")
def main():
"""Main funcion to test scraper"""
data = gpus.get_data()
message_size = 100
chunked_data = [data[i:i + message_size]
for i in range(0, len(data), message_size)]
for i in chunked_data:
print("\n".join(i))
if __name__ == "__main__":
main()
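
Since SS is parameterised only by the listing URL, the same class can in principle be pointed at other ss.com categories, as long as they share the GPU listing's table layout. A minimal sketch; the processors URL below is an assumption for illustration, not something this commit uses:

# Hypothetical reuse of the SS class for another ss.com category.
# The URL is an assumption; the page must share the GPU listing's table layout.
from scraper import SS

cpus = SS("https://www.ss.com/lv/electronics/computers/completing-pc/processors/sell")
print("\n".join(cpus.get_data()[:10]))  # ten cheapest listings as formatted strings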