Skip to content

Commit

Permalink
Scryfall data broker (#22)
Browse files Browse the repository at this point in the history
* more docker fixes networking and names

* get scryfall bulk data and json file

* add example env

* change example env name

* a few notes, add test data, update git ignore

* set up python venv and organizing

* cleanup filenames and README

* differ working

* Logger added

* finding oldest file, condensing

* log to file

* logs moved

* remove limit on data

* update README

* Delete scryfall-data-broker/env directory
  • Loading branch information
mathewmorris authored Oct 12, 2024
1 parent 36030a5 commit 65a1709
Show file tree
Hide file tree
Showing 17 changed files with 423 additions and 27 deletions.
8 changes: 8 additions & 0 deletions .env.example
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
DATABASE_URL="postgresql://postgres:postgres@magicvault-db/magicvault"
DISCORD_CLIENT_ID="1"
DISCORD_CLIENT_SECRET="1"
GITHUB_ID="1"
GITHUB_SECRET="1"
NEXTAUTH_SECRET="1"
NEXTAUTH_URL="http://localhost:3000"
VERCEL_URL="localhost:3000"
4 changes: 1 addition & 3 deletions dev.Dockerfile → Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -5,18 +5,16 @@ FROM node:${NODE_VERSION}-alpine
WORKDIR /app

COPY package.json package-lock.json* ./
COPY prisma ./prisma

RUN npm ci

COPY src ./src
COPY prisma ./prisma
COPY public ./public
COPY next.config.js .
COPY tailwind.config.ts .
COPY postcss.config.cjs .
COPY tsconfig.json .

RUN npm run db:gen

CMD npm run dev

4 changes: 2 additions & 2 deletions compose.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,9 @@
# https://github.com/docker/awesome-compose
services:
next-app:
container_name: next-app
container_name: magicvault-next-app
build:
context: .
dockerfile: dev.Dockerfile
env_file: ".env.local"
volumes:
- ./src:/app/src
Expand All @@ -31,6 +30,7 @@ services:
# to set the database password. You must create `db/password.txt` and add
# a password of your choosing to it before running `docker-compose up`.
db:
container_name: magicvault-db
image: postgres
restart: always
user: postgres
Expand Down
2 changes: 0 additions & 2 deletions fuzzy-magic/.gitignore

This file was deleted.

11 changes: 0 additions & 11 deletions fuzzy-magic/requirements.txt

This file was deleted.

5 changes: 3 additions & 2 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@
"version": "0.1.0",
"private": true,
"scripts": {
"build": "prisma generate && next build",
"build": "next build",
"postinstall": "prisma generate",
"dev": "next dev",
"lint": "next lint",
"start": "next start",
Expand All @@ -13,7 +14,7 @@
"vercel:link": "vercel link",
"db:gen": "prisma generate",
"db:seed": "prisma db seed",
"db:migrate": "prisma db migrate",
"db:migrate": "prisma migrate dev",
"db:push": "prisma db push"
},
"dependencies": {
Expand Down
5 changes: 5 additions & 0 deletions scryfall-data-broker/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
env/
scryfall_downloads/
events/
logs/

25 changes: 25 additions & 0 deletions scryfall-data-broker/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# Scryfall Data Broker

A service that uses Python to get the bulk data from api.scryfall.com.
Right now we're only grabbing Default Cards;
[go here](https://scryfall.com/docs/api/bulk-data) to see the differences between the available bulk data types.


How long does it take? TBD
## How does it work?
- grab download url from scryfall
- use url to download latest data dump
- log diff between current and newest data files

Fetch Scryfall Data
`python get_bulk_data_json_file.py`

Diff old set and new set, create file that notes changes
`python card_differ.py`

Update database & log changes made
`python db_updater.py`

## Crontab
Run the broker once a day at 16:20:
```20 16 * * *```

187 changes: 187 additions & 0 deletions scryfall-data-broker/__main__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,187 @@

# Scryfall Data Broker

# A service that uses Python to get the bulk data from api.scryfall.com
# Right now we're only grabbing Default Cards
# [go here](https://scryfall.com/docs/api/bulk-data) to see the differences


# How long does it take? ~8 minutes (0 changes detected)
# How does it work? Using [DeepDiff by Zepworks](https://zepworks.com/deepdiff/current/basics.html) to diff the lists. Takes quite a while since we're comparing ~210k items, but I'm not too worried about that since I'll be running this locally until I can figure out optimizations.

# Fetch Scryfall Data
# `python get_bulk_data_json_file.py`
import sys
import os
import logging.config
import logging
from deepdiff import DeepDiff
import time
import json
from datetime import datetime, timedelta
import requests

# Base logging configuration (handlers, levels, formats) is loaded from
# logger.conf on disk; the 'card_differ' logger is declared there.
logging.config.fileConfig('logger.conf')
logger = logging.getLogger('card_differ')
# Additionally mirror every record (DEBUG and up) into logs/main.log,
# on top of whatever handlers logger.conf configured.
formatter = logging.Formatter(
'%(asctime)s - %(name)s - %(levelname)s - %(message)s')
fh = logging.FileHandler('logs/main.log')
fh.setLevel(logging.DEBUG)
fh.setFormatter(formatter)
logger.addHandler(fh)

###################################################
# Gather bulk data from Scryfall
###################################################

# The bulk-data endpoint returns *metadata* about the "default cards" dump
# (including a download_uri pointing at the actual JSON file).
url = 'https://api.scryfall.com/bulk-data/default-cards'
headers = {
    # Scryfall asks API clients to identify themselves via User-Agent.
    'User-Agent': "MagicVaultScryfallDataBroker/1.0",
    'Accept': "application/json;q=1.0,*/*;q=0.9"
}

# timeout added: requests has no default timeout, so a stalled connection
# would otherwise hang the broker forever.
download_uri_response = requests.get(url, headers=headers, timeout=30)

if download_uri_response.status_code == requests.codes.ok:
    logger.debug(
        f"{download_uri_response.status_code}: {download_uri_response}")
    # Parse the metadata and extract the URL of today's dump file.
    download_uri = download_uri_response.json()['download_uri']
    file_name = f"scryfall_downloads/default_cards_{datetime.today().strftime('%Y_%m_%d')}.json"

    if os.path.exists(file_name):
        logger.warning(f"File exists, skipping download. \
Delete file ({file_name}) if you want to redownload.")
    else:
        # Stream the (very large) dump straight to disk instead of
        # buffering the whole body in memory. 1 MiB chunks: the original
        # 128-byte chunks meant millions of write calls per dump.
        bulk_data_response = requests.get(download_uri, stream=True,
                                          timeout=30)
        logger.info(f"downloading {file_name}")
        with open(file_name, 'wb') as fd:
            for chunk in bulk_data_response.iter_content(chunk_size=1 << 20):
                fd.write(chunk)
        logger.info(f"successfully downloaded {file_name}!")
else:
    # NOTE(review): when this branch is taken, file_name is never defined,
    # so the differ section below will fail with NameError — confirm
    # whether the script should exit here instead.
    logger.critical('Attempting to download %s resulted in %s code.',
                    url, download_uri_response.status_code)

# Diff old set and new set, create file that notes changes
# `python card_differ.py`

##################################################################
# Card Differ: Diff the last bulk dump vs new one then store new
##################################################################

# Wall-clock anchors for the diff run: we compare today's dump against the
# most recent earlier one, starting the search at yesterday.
start = time.time()
today = datetime.today()
yesterday = today - timedelta(days=1)


def build_file_name_with_date(date):
    """Return the bulk-dump path for *date*:
    scryfall_downloads/default_cards_YYYY_MM_DD.json
    """
    stamp = date.strftime('%Y_%m_%d')
    return f"scryfall_downloads/default_cards_{stamp}.json"


old_file_name = build_file_name_with_date(yesterday)


def open_oldest_file(date, max_days_back=1000):
    """Load the most recent bulk dump at or on *date*, walking backwards.

    Tries the dump file for *date*; if it doesn't exist, steps back one
    day at a time until a file is found.

    Args:
        date: datetime to start searching from.
        max_days_back: safety bound on the search. The original recursive
            version recursed once per missing day and would die with
            RecursionError (limit ~1000) if no dump existed at all; this
            bound keeps roughly that horizon but fails with a clear error.

    Returns:
        A two-element list: [parsed JSON data, file name].

    Raises:
        FileNotFoundError: if no dump exists within max_days_back days.
    """
    for _ in range(max_days_back):
        file_name = build_file_name_with_date(date)
        try:
            with open(file_name, 'r') as f:
                logger.debug("Found '%s', loading in...", file_name)
                return [json.load(f), file_name]
        except FileNotFoundError:
            next_date = date - timedelta(days=1)
            logger.warning("'%s' not found, trying '%s' next",
                           file_name, build_file_name_with_date(next_date))
            date = next_date
    raise FileNotFoundError(
        f"No bulk dump found within {max_days_back} days before the start date.")


# Read files: yesterday's (or the most recent earlier) dump vs today's.
logger.info('Reading files...')
reading_files_start = time.time()

[oldData, old_file_name] = open_oldest_file(yesterday)

# file_name was set by the download step above (today's dump).
with open(file_name, 'r') as f:
    newData = json.load(f)

logger.debug('Took %d seconds!', time.time() - reading_files_start)
logger.info(f"{old_file_name}: has {len(oldData)} cards")
logger.info(f"{file_name}: has {len(newData)} cards")
logger.info(f"Starting to diff {old_file_name} and {file_name}")
# Diff data and keep list of updates.
# NOTE: DeepDiff(t1, t2) reports how t2 differs from t1, so the OLD dump
# must come first for 'dictionary_item_added' to mean "card added in the
# new dump" (which identifyAction maps to CREATE). The original call had
# the arguments the other way around, inverting CREATE and DELETE.
events = DeepDiff(
    oldData,
    newData,
    group_by='id',
    verbose_level=2
)

events_file = f"events/events_{datetime.today().strftime('%Y_%m_%d')}.json"
with open(events_file, 'w') as f:
    f.write(events.to_json())

logger.info(events.to_dict())

logger.info("Events logged to '%s'.", events_file)
# (Removed a leftover sys.exit() here: it made every line below it —
# the whole action-building stage — unreachable.)


def identifyAction(action, event_id):
    """Map a DeepDiff event onto a database action descriptor.

    Args:
        action: DeepDiff event type ('dictionary_item_added',
            'dictionary_item_removed' or 'values_changed').
        event_id: DeepDiff path key, e.g. "root['<card id>']" or
            "root['<card id>']['<field>']".

    Returns:
        A dict describing the CREATE / DELETE / UPDATE to apply, or None
        for an unrecognized event type (also logged as critical).
    """
    result = None
    if action == 'dictionary_item_added':
        # Card present in the new dump only: create it wholesale.
        result = {'action': 'CREATE',
                  'payload': events[action][event_id]}
    elif action == 'dictionary_item_removed':
        # Path looks like "root['<id>']" -- extract the card id.
        result = {'action': 'DELETE', 'id': event_id.split(
            "['")[1].replace("']", "")}
    elif action == 'values_changed':
        # Path looks like "root['<id>']['<field>']".
        [_, key, name] = event_id.split("['")
        result = {'action': 'UPDATE',
                  'payload': {
                      'id': key.replace("']", ""),
                      'key': name.replace("']", ""),
                      'update': events[action][event_id]}
                  }

    if result is None:
        # The original compared the (reassigned) parameter against None,
        # which could never be true for an unknown event type, and it
        # never returned anything at all -- callers always received None.
        logger.critical('No case made for event type: action: %s, event_id: %s, events: %s',
                        action, event_id, events)
    return result


# Turn every diffed event into a database action descriptor.
actions = []

logger.info('Starting to create actions based on diff...')
for event_type in events:
    logger.debug('event_type: %s', event_type)
    for event_id in events[event_type]:
        logger.debug('event_id: %s', event_id)
        new_action = identifyAction(event_type, event_id)

        logger.debug('Action created: %s', new_action)
        actions.append(new_action)
logger.info('%d actions created!', len(actions))

# Save newest data dump

# Report total wall-clock time for the whole run.
end = time.time()
length = end - start
logger.info(f"It took {length} seconds!")

# Update database & log changes made
# `python db_updater.py`
# TODO:
###################################################
# Update database
###################################################

# Use list of actions to know which to update/add/delete
# for action in actions:

# Save results of update in database, to use later for metrics
20 changes: 13 additions & 7 deletions fuzzy-magic/card_adder.py → scryfall-data-broker/card_adder.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,13 +15,15 @@
url = "http://localhost:3000/api/card/add"

# Custom JSON Encoder


class DecimalEncoder(json.JSONEncoder):
    """json.JSONEncoder that renders Decimal values as floats."""

    def default(self, obj):
        # json can't serialize Decimal natively: down-convert it to float
        # and let the base class reject any other unknown type.
        if isinstance(obj, Decimal):
            return float(obj)
        return super().default(obj)

# Function to process each card

def process_card(card):
data = {
'name': card['name'],
Expand All @@ -34,19 +36,23 @@ def process_card(card):

try:
response = requests.post(
url,
json=data,
url,
json=data,
verify=False,
headers={'Content-Type': 'application/json'}
)
pretty_card = json.dumps(data, cls=DecimalEncoder, indent=4)
print(f"Card data successfully sent. {pretty_card}")

if response.status_code == requests.codes.ok:
pretty_card = json.dumps(data, cls=DecimalEncoder, indent=4)
print(f"Card data successfully sent. \n ===== \n{pretty_card}\n")
else:
print(response.status_code, response.reason)

except requests.exceptions.RequestException as e:
print("Error:", e)

# Open and process the JSON file in chunks

with open(filename, 'rb') as file:
parser = ijson.items(file, 'item')
for card in parser:
process_card(card)

Loading

0 comments on commit 65a1709

Please sign in to comment.