Mirror of https://github.com/kristoferssolo/SScom-scraper.git (synced 2025-10-21 19:50:33 +00:00)
Initial commit
This commit is contained in: commit 749aa131a6
116 .gitignore vendored Normal file
@@ -0,0 +1,116 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

config.json
42 main.py Normal file
@@ -0,0 +1,42 @@
"""
Telegram bot for scraper
Author - Kristofers Solo
Licence - MIT
"""
import json
import logging
from pathlib import Path

from aiogram import Bot, Dispatcher, executor, types

from scraper import gpus

BASE_DIR = Path(__file__).resolve().parent

# Read the token from file
with open(Path(BASE_DIR, "config.json"), "r", encoding="UTF-8") as config_file:
    config = json.load(config_file)

API_TOKEN = config["API_TOKEN"]

# Configure logging
logging.basicConfig(level=logging.INFO)

# Initialize bot and dispatcher
bot = Bot(token=API_TOKEN)
dp = Dispatcher(bot)


@dp.message_handler(commands=["gpu", "gpus"])
async def gpu_price_message(message: types.Message):
    """Returns all scraped GPUs and their prices to Telegram"""
    data = gpus.get_data()
    message_size = 100
    chunked_data = [data[i:i + message_size]
                    for i in range(0, len(data), message_size)]

    for i in chunked_data:
        await message.answer("\n".join(i))
    await message.answer(f"In total {len(data)} GPUs")


if __name__ == "__main__":
    executor.start_polling(dp, skip_updates=True)
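main.py expects a config.json next to it, but the commit does not include one because config.json is listed in .gitignore. Below is a minimal sketch, not part of the commit, of how such a file could be generated; it assumes "API_TOKEN" is the only key main.py reads, and the token value is a placeholder.

# Hypothetical helper (not in the commit) that writes the config.json main.py reads at startup.
# "API_TOKEN" is the only key main.py accesses; the value here is a placeholder, not a real credential.
import json
from pathlib import Path

BASE_DIR = Path(__file__).resolve().parent
config = {"API_TOKEN": "123456789:REPLACE_WITH_YOUR_BOT_TOKEN"}

with open(Path(BASE_DIR, "config.json"), "w", encoding="UTF-8") as config_file:
    json.dump(config, config_file, indent=2)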
21 requirements.txt Normal file
@@ -0,0 +1,21 @@
aiogram==2.22.1
aiohttp==3.8.1
aiosignal==1.2.0
async-timeout==4.0.2
attrs==22.1.0
Babel==2.9.1
beautifulsoup4==4.11.1
bs4==0.0.1
certifi==2022.6.15.1
charset-normalizer==2.1.1
frozenlist==1.3.1
idna==3.3
multidict==6.0.2
numpy==1.23.2
python-dateutil==2.8.2
pytz==2022.2.1
requests==2.28.1
six==1.16.0
soupsieve==2.3.2.post1
urllib3==1.26.12
yarl==1.8.1
93 scraper.py Normal file
@@ -0,0 +1,93 @@
"""
Web scraper of SS marketplace for GPUs
Author - Kristofers Solo
Licence - MIT
"""

from bs4 import BeautifulSoup
import requests

HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.97 Safari/537.36 Vivaldi/4.1.2369.21"}


class SS:
    def __init__(self, url):
        self.url = url

    def _get_page_amount(self) -> int:
        page = requests.get(self.url, headers=HEADERS)
        soup = BeautifulSoup(page.content, "html.parser")

        last_url = soup.find(class_="td2").findChild("a")["href"]
        page_amount = last_url[last_url.find(
            "page") + 4:last_url.find(".html")]
        print(f"Page amount = {page_amount}")

        return int(page_amount)

    def get_data(self) -> list:
        """Runs the scraper"""
        gpus_list = []
        for page_number in range(1, self._get_page_amount() + 1):
            url = self.url + f"/page{page_number}.html"
            page = requests.get(url, headers=HEADERS)
            soup = BeautifulSoup(page.content, "html.parser")

            # item ids
            ids = [tag["id"]
                   for tag in soup.select("tr[id]")]  # creates list with ids
            # removes "tr_bnr" elements from list
            ids = [x for x in ids if "tr_bnr" not in x]
            ids.remove("head_line")  # removes first "head_line" id

            # getting item data
            for item_no, elem in enumerate(soup.find_all(id=ids)):
                print(f"Item {item_no + 1} on page {page_number}")

                # advert's url
                item = elem.find_all(class_="msga2-o pp6")  # gets url
                gpu = []
                for text in item:
                    gpu.append(text.get_text())
                for _ in range(3):
                    gpu.pop(1)

                # removes commas from prices
                gpu[1] = gpu[1].replace(",", "")

                # removes excessive symbols from price (€ and white-spaces)
                gpu[1] = gpu[1][:-3]

                # converts prices to float
                gpu[1] = float(gpu[1])

                gpus_list.append(gpu)

        gpus_list = sorted(gpus_list, key=lambda x: x[1])

        for index, gpu in enumerate(gpus_list):
            # convert price back to string and add `€`
            gpu[1] = str(gpu[1]) + " €"
            # transform 2D array to 1D
            gpus_list[index] = (" - ".join(gpu))

        return gpus_list


gpus = SS("https://www.ss.com/lv/electronics/computers/completing-pc/video/sell")


def main():
    """Main function to test scraper"""
    data = gpus.get_data()

    message_size = 100
    chunked_data = [data[i:i + message_size]
                    for i in range(0, len(data), message_size)]
    for i in chunked_data:
        print("\n".join(i))


if __name__ == "__main__":
    main()
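Because SS takes the category URL as its only constructor argument, the same class could in principle scrape other ss.com sections. A minimal sketch follows, assuming the target category page uses the same listing table layout as the GPU section; the processors URL below is an assumption, not something this commit verifies.

# Hypothetical reuse of the SS class for another ss.com category (not part of the commit).
# Assumes the category page uses the same table layout as the GPU listing.
from scraper import SS

cpus = SS("https://www.ss.com/lv/electronics/computers/completing-pc/processors/sell")

# get_data() returns "name - price €" strings sorted by price, cheapest first
for line in cpus.get_data()[:10]:
    print(line)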