# %pip install -qq gdown && gdown -qO . --folder https://drive.google.com/drive/folders/1qgLH_MERpz3nQyDBwpPPKxxcpYUVJoZy
# %pip install -qq emblaze numpy==2.0.0 hdbscan umap-learn
Note: you may need to restart the kernel to use updated packages.
import pandas as pd
import numpy as np
from sklearn.metrics import silhouette_score
import matplotlib.cm as cm
from matplotlib.patches import Patch
import warnings
# Suppress FutureWarning/UserWarning messages attributed to the sklearn and umap
# modules so they do not clutter the notebook output.
warnings.filterwarnings("ignore", category=FutureWarning, module='sklearn')
warnings.filterwarnings("ignore", category=UserWarning, module='sklearn')
warnings.filterwarnings("ignore", category=UserWarning, module='umap')
Dataset Loading¶
from pathlib import Path
import os
# Processed CSVs live in ./data/processed relative to the current working directory.
DATA_FOLDER = Path(os.path.abspath('')).joinpath('data', 'processed')
# One row per card: id, name, text and the emb_* embedding columns.
card_embeddings = pd.read_csv(DATA_FOLDER.joinpath('card_embeddings.csv'))
card_embeddings.head(n=5)
| id | name | text | emb_0 | emb_1 | emb_2 | emb_3 | emb_4 | emb_5 | emb_6 | ... | emb_1014 | emb_1015 | emb_1016 | emb_1017 | emb_1018 | emb_1019 | emb_1020 | emb_1021 | emb_1022 | emb_1023 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 00037840-6089-42ec-8c5c-281f9f474504 | Nissa, Worldsoul Speaker | Landfall — Whenever a land you control enters,... | -0.063584 | 0.037000 | 0.010670 | -0.060692 | -0.045203 | -0.010521 | 0.029942 | ... | -0.014268 | 0.015354 | 0.006431 | -0.023961 | -0.014961 | 0.027579 | 0.080377 | 0.017635 | 0.006588 | -0.012813 |
| 1 | 0004ebd0-dfd6-4276-b4a6-de0003e94237 | Static Orb | As long as this artifact is untapped, players ... | -0.038534 | 0.034346 | 0.003165 | 0.040716 | -0.032311 | -0.028376 | -0.046908 | ... | 0.034454 | -0.016477 | 0.010473 | 0.012806 | -0.030796 | 0.020897 | 0.066356 | 0.007361 | -0.045367 | 0.009224 |
| 2 | 0006faf6-7a61-426c-9034-579f2cfcfa83 | Sensory Deprivation | Enchant creature\nEnchanted creature gets -3/-0. | -0.049110 | 0.009907 | -0.016172 | -0.021944 | -0.060510 | -0.009141 | 0.017910 | ... | 0.021881 | -0.017384 | 0.015508 | -0.007817 | 0.009093 | 0.044049 | 0.067383 | 0.021220 | 0.009579 | -0.017747 |
| 3 | 0007c283-5b7a-4c00-9ca1-b455c8dff8c3 | Road of Return | Choose one —\n• Return target permanent card f... | -0.016619 | 0.023182 | 0.002394 | 0.003111 | -0.090463 | -0.053103 | -0.031972 | ... | 0.014071 | -0.013422 | -0.000095 | -0.045262 | -0.034674 | -0.013514 | 0.045766 | 0.008279 | 0.012410 | -0.008559 |
| 4 | 000d5588-5a4c-434e-988d-396632ade42c | Storm Crow | Flying (This creature can't be blocked except ... | -0.050170 | 0.022292 | -0.007707 | 0.035540 | 0.017752 | -0.010925 | 0.037595 | ... | 0.033296 | -0.003183 | -0.012083 | -0.010079 | -0.019373 | 0.037211 | 0.067933 | -0.013548 | 0.015296 | -0.015259 |
5 rows × 1027 columns
# Tags flagged as categories (id and name columns).
category_tags = pd.read_csv(DATA_FOLDER.joinpath('category_tags.csv'))
category_tags.head(n=5)
| id | name | |
|---|---|---|
| 0 | e506f033-f9d4-4263-b4b7-8a0eecd2cbe1 | affinity |
| 1 | b9f47bcf-658c-4ccf-ab4b-3fb0391f052f | animate |
| 2 | d4c5578c-e9a9-41b5-beb3-0fbcd4068094 | banish |
| 3 | 022df3fc-9720-478d-8b40-755865b70cdb | blue-effect |
| 4 | 0641a74c-4dd5-426d-be58-2ab86d71995d | burn |
# One row per tag: id, name, description and the emb_* embedding columns.
# Some tags have no embedding (all-NaN emb_* values), so downstream code drops them.
tag_embeddings = pd.read_csv(DATA_FOLDER.joinpath('tag_embeddings.csv'))
tag_embeddings.head(n=5)
| id | name | description | emb_0 | emb_1 | emb_2 | emb_3 | emb_4 | emb_5 | emb_6 | ... | emb_1014 | emb_1015 | emb_1016 | emb_1017 | emb_1018 | emb_1019 | emb_1020 | emb_1021 | emb_1022 | emb_1023 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 31b256df-2d69-4998-85a2-fdde7b95d466 | cycle-ths-god-weapon | NaN | -0.034113 | 0.018675 | 0.014336 | -0.013073 | -0.036177 | -0.024673 | 0.005378 | ... | 0.013687 | -0.020731 | -0.004290 | -0.011961 | -0.010511 | 0.016711 | 0.065340 | 0.013900 | -0.000859 | -0.011380 |
| 1 | d6359da6-4464-4731-a7a1-16f2b38eb4e8 | cycle-mom-draft-signpost | NaN | -0.019731 | 0.018416 | 0.016350 | -0.016205 | -0.023058 | -0.036750 | 0.012292 | ... | 0.009475 | -0.012831 | 0.000833 | -0.003301 | -0.007723 | 0.023210 | 0.072087 | 0.016457 | 0.010171 | -0.010572 |
| 2 | acc69593-2540-4f10-b3d0-79afebc5abe1 | cycle-khm-m-god | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 3 | ae514071-a901-4cda-90b3-8f49ff7c3a44 | typal-devil | NaN | -0.017256 | 0.006668 | 0.016094 | -0.011581 | -0.003440 | -0.046818 | 0.001683 | ... | 0.019980 | -0.022219 | 0.014171 | 0.010241 | -0.018499 | 0.028761 | 0.072643 | 0.011631 | 0.010490 | -0.017172 |
| 4 | ad0438b6-57e0-4ed4-8577-972281fb3892 | cycle-apocalypse-split-card | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
5 rows × 1027 columns
# Per-tag n-grams with their bm25 and gini scores.
tag_ngrams = pd.read_csv(DATA_FOLDER.joinpath('tag_ngrams.csv'))
tag_ngrams.head(n=5)
| tag | ngram | bm25 | gini | |
|---|---|---|---|---|
| 0 | affinity | tapped creature exile target creature | 129.250086 | 0.933441 |
| 1 | affinity | turn return target creature card | 110.558162 | 0.867284 |
| 2 | affinity | life return target creature card | 109.784158 | 0.866445 |
| 3 | affinity | wizard return target creature card | 108.935874 | 0.865058 |
| 4 | affinity | targets tapped creature exile target | 103.640695 | 0.973971 |
Card-Tag Semantic Similarity Analysis¶
from sklearn.metrics.pairwise import cosine_similarity
def get_similar_tags(card_name, threshold=0.70):
    """Return the tags whose embedding is close to a card's embedding.

    Reads the notebook-level ``card_embeddings``, ``tag_embeddings`` and
    ``category_tags`` frames.

    Parameters
    ----------
    card_name : str
        Exact value to look up in the ``name`` column of ``card_embeddings``.
        If several rows share the name, the first one is used.
    threshold : float, default 0.70
        Minimum cosine similarity (inclusive) for a tag to be returned.

    Returns
    -------
    pandas.DataFrame
        Columns ``name``, ``description``, ``category`` and ``similarity``,
        sorted by descending similarity. ``category`` is True when the tag
        name also appears in ``category_tags['name']``. The index is the
        tag's row label in ``tag_embeddings``.

    Raises
    ------
    ValueError
        If no card is named ``card_name``.
    """
    # Get embedding for the specified card
    card_columns = [c for c in card_embeddings.columns if c.startswith('emb_')]
    card_emb = card_embeddings.loc[
        card_embeddings['name'] == card_name, card_columns
    ].values
    if card_emb.shape[0] == 0:
        # Fail with an actionable message instead of sklearn's generic
        # "array with 0 sample(s)" error.
        raise ValueError(f"No card named {card_name!r} in card_embeddings")
    # Only the first matching row is compared against the tags.
    card_emb = card_emb[:1]

    # Tags without an embedding (NaN emb_* values) cannot be compared.
    embedding_columns = [c for c in tag_embeddings.columns if c.startswith('emb_')]
    filtered_tag_embeddings = tag_embeddings.dropna(subset=embedding_columns)
    tag_emb = filtered_tag_embeddings[embedding_columns].values

    similarities = cosine_similarity(card_emb, tag_emb)[0]

    # Select tags above threshold; the mask is computed once and reused so the
    # rows and their similarity values stay aligned.
    above_threshold = similarities >= threshold
    similar_tags = filtered_tag_embeddings.loc[
        above_threshold, ['name', 'description']
    ].copy()
    similar_tags['similarity'] = similarities[above_threshold]
    similar_tags = similar_tags.sort_values(by='similarity', ascending=False)

    # Add a column that indicates whether the tag is a category tag
    similar_tags['category'] = similar_tags['name'].isin(category_tags['name'])
    return similar_tags[['name', 'description', 'category', 'similarity']]
# Look up the tags close to one card and display only the category tags.
card_name = 'Lightning Bolt'
print(f"Tags similar to '{card_name}' with similarity >= 0.70:")
similar_tags = get_similar_tags(card_name)
# Show full tag descriptions instead of truncating long cells.
pd.set_option('display.max_colwidth', None)
similar_tags.loc[similar_tags['category']]
Tags similar to 'Lightning Bolt' with similarity >= 0.70:
| name | description | category | similarity | |
|---|---|---|---|---|
| 414 | burn | Effects that deal damage, whether to creatures, players, or planeswalkers. | True | 0.759430 |
| 289 | removal | Spot or limited removal. See also [sweeper] for removal effects that remove everything. | True | 0.718648 |
| 1506 | hate | Cards that hate on things — colors, card types, zones, etc. | True | 0.701495 |
The text for Lightning Bolt simply says "Lightning Bolt deals 3 damage to any target."

This card is often used to deal damage to a player or kill a creature, and so the tags 'burn' and 'removal' are more appropriate than 'hate', which is more about counteracting or preventing a specific strategy or card type. All three are similar semantically, though in reality, 'hate' is likely much more broad and contains a lot of divergent examples.
# Five highest-bm25 n-grams recorded for the 'hate' tag.
tag_ngrams.loc[tag_ngrams['tag'].eq('hate')].nlargest(n=5, columns='bm25')
| tag | ngram | bm25 | gini | |
|---|---|---|---|---|
| 1417 | hate | create white spirit creature token | 127.141643 | 0.959722 |
| 1418 | hate | creature deals damage equal | 111.067971 | 0.885240 |
| 1419 | hate | creature fights target creature token | 103.862515 | 0.843336 |
| 1420 | hate | destroy target artifact destroy target | 103.819382 | 0.932747 |
| 1421 | hate | enters return target creature card | 97.610903 | 0.860413 |
The high Gini indices among the 'hate' tag's ngrams indicate high specificity, meaning that the tag is often used with very specific cards or strategies and isn't associated with a broad or semantically consistent concept.
We can surmise that the average embedding we computed for this tag is likely not representative of the tag's usage in general (unless these ngrams are similar to each other semantically). Inspecting these, they appear to describe very different effects, some of which are already covered by 'burn' and 'removal'.
# Five highest-bm25 n-grams recorded for the 'burn' tag.
tag_ngrams.loc[tag_ngrams['tag'].eq('burn')].nlargest(n=5, columns='bm25')
| tag | ngram | bm25 | gini | |
|---|---|---|---|---|
| 200 | burn | creature deals damage equal | 180.356371 | 0.450908 |
| 201 | burn | control deals damage equal | 146.810756 | 0.718176 |
| 202 | burn | opponent controls deals damage equal | 135.097132 | 0.682989 |
| 203 | burn | creature deals damage divided | 108.498427 | 0.640603 |
| 204 | burn | controls deals damage equal | 100.830561 | 0.629047 |
# Five highest-bm25 n-grams recorded for the 'removal' tag.
tag_ngrams.loc[tag_ngrams['tag'].eq('removal')].nlargest(n=5, columns='bm25')
| tag | ngram | bm25 | gini | |
|---|---|---|---|---|
| 2517 | removal | creature deals damage equal | 172.399504 | 0.692925 |
| 2518 | removal | control deals damage equal | 161.396654 | 0.745258 |
| 2519 | removal | destroy target artifact destroy target | 149.770545 | 0.820185 |
| 2520 | removal | destroy target attacking creature draw | 140.049085 | 0.903502 |
| 2521 | removal | target attacking creature draw card | 136.032233 | 0.960930 |
Comparing 'burn' and 'removal' shows that there are some overlapping ngrams, which is expected, as there exist sub-tags in each that describe similar effects:
# Non-category tags whose similarity to the card is at least 0.75.
similar_tags.loc[~similar_tags['category'] & similar_tags['similarity'].ge(0.75)]
| name | description | category | similarity | |
|---|---|---|---|---|
| 1969 | cycle-boon | NaN | False | 0.826900 |
| 855 | burn-any | Cards that burn any target: players, creatures, battles, or planeswalkers. | False | 0.780491 |
| 1978 | fractional-life-damage | Cards that dealt ½ damage or gained ½ life from Unhinged | False | 0.759388 |
| 1720 | removal-creature-burn | NaN | False | 0.757758 |
We also see an example of a tag whose cards have mechanically very different effects but are intended to be related to each other (as a cycle). According to the MTG Wiki, the 'cycle-boon' tag contains cards with an effect involving the number 3, including Lightning Bolt.
Because this cycle is so compact, however, we still see strong semantic similarity between the tag and cards similar to Giant Growth, Lightning Bolt, etc.
# Embedding row(s) of the 'cycle-boon' tag as a 2-D array (what cosine_similarity expects).
cycle_boon_embedding = (
    tag_embeddings
    .loc[tag_embeddings['name'].eq('cycle-boon')]
    .filter(regex=r'^emb_')
    .to_numpy()
)

# Rank every card by cosine similarity to that tag embedding; row 0 of the
# similarity matrix corresponds to the first 'cycle-boon' row.
similar_cards = card_embeddings.assign(
    similarity=cosine_similarity(
        cycle_boon_embedding,
        card_embeddings.filter(regex=r'^emb_').to_numpy(),
    )[0]
)
similar_cards = similar_cards.sort_values(by='similarity', ascending=False)
similar_cards[['name', 'text', 'similarity']].head(20)
| name | text | similarity | |
|---|---|---|---|
| 10452 | Giant Growth | Target creature gets +3/+3 until end of turn. | 0.905424 |
| 18341 | Brute Force | Target creature gets +3/+3 until end of turn. | 0.905424 |
| 25255 | Infuriate | Target creature gets +3/+2 until end of turn. | 0.900170 |
| 25960 | Dark Remedy | Target creature gets +1/+3 until end of turn. | 0.899249 |
| 25983 | Sangrite Surge | Target creature gets +3/+3 and gains double strike until end of turn. | 0.893712 |
| 12702 | Fit of Rage | Target creature gets +3/+3 and gains first strike until end of turn. | 0.893535 |
| 18820 | Healing Leaves | Choose one —\n• Target player gains 3 life.\n• Prevent the next 3 damage that would be dealt to any target this turn. | 0.893396 |
| 17017 | Healing Salve | Choose one —\n• Target player gains 3 life.\n• Prevent the next 3 damage that would be dealt to any target this turn. | 0.893396 |
| 29494 | Antagonize | Target creature gets +4/+3 until end of turn. | 0.893157 |
| 26081 | Supersize | Target creature gets +3½/+3½ until end of turn. | 0.890678 |
| 17062 | Sudden Strength | Target creature gets +3/+3 until end of turn.\nDraw a card. | 0.889123 |
| 23533 | First Stage of Magic Design | You gain 3 life. Draw three cards. Add {B}{B}{B}. This spell deals 3 damage to any target. Target creature gets +3/+3 until end of turn. | 0.881959 |
| 30668 | Gift of Strength | Target creature gets +3/+3 and gains reach until end of turn. | 0.877966 |
| 28397 | Withstand | Prevent the next 3 damage that would be dealt to any target this turn.\nDraw a card. | 0.877767 |
| 6304 | Secret Rendezvous | You and target opponent each draw three cards. | 0.876926 |
| 9498 | Rebellious Strike | Target creature gets +3/+0 until end of turn.\nDraw a card. | 0.873077 |
| 10423 | Sugar Rush | Target creature gets +3/+0 until end of turn.\nDraw a card. | 0.873077 |
| 25681 | Whiptail Moloch | When this creature enters, it deals 3 damage to target creature you control. | 0.873071 |
| 10197 | Ancestral Recall | Target player draws three cards. | 0.872372 |
| 6537 | A-Deal Gone Bad | Target creature gets -3/-3 until end of turn. Target player mills three cards. You gain 3 life. | 0.870827 |
For our purposes, these tags are not conceptually useful, as they usually refer to a specific group of 5 cards within a set (a cycle). Because the effects in these cycles can be any kind of generic effect (as is often intended by the game designers), they are not useful for our analysis.
Tag Dataset Filtering¶
# Boolean mask over tag_embeddings rows: True marks a tag to EXCLUDE from the
# analysis. str.contains matches anywhere in the name; str.startswith only at
# the beginning.
filter_mask = (
    # Filter out cycle tags
    tag_embeddings['name'].str.contains('cycle-') |
    (tag_embeddings['name'] == 'cycle') |
    # Filter out tags that reference obscure types (i.e. typal)
    tag_embeddings['name'].str.contains('typal-') |
    (tag_embeddings['name'] == 'typal') |
    # Filter out tags that reference type erratas
    tag_embeddings['name'].str.contains('type-errata-') |
    # NOTE(review): 'depreciated-' may be a misspelling of 'deprecated-' —
    # confirm against the actual tag names before changing it.
    tag_embeddings['name'].str.contains('depreciated-') |
    # Filter out tags that reference mechanics from Dungeons & Dragons
    # (These are specific to a D&D crossover set that are intentionally obtuse)
    tag_embeddings['name'].str.startswith('dnd-') |
    (tag_embeddings['name'] == 'dnd') |
    # Filter out tags prefixed 'un-' / 'unstable-'
    # (presumably Un-set tags, like the list further down — TODO confirm)
    tag_embeddings['name'].str.startswith('un-') |
    tag_embeddings['name'].str.startswith('unstable-') |
    # Filter out tags whose name contains '-name' or 'type-line'
    tag_embeddings['name'].str.contains('-name') |
    tag_embeddings['name'].str.contains('type-line') |
    # Filter out specific tags
    tag_embeddings['name'].isin([
        'alliteration',
        'tongue-twister',
        'anagram',
        'namesake-spell',
        'substance',
        'eponymous',
        # Un-set mechanics and tags
        'fractional-life-damage',
        'time-matters',
        'watermark-matters',
        'art-matters',
        'flavor-text-matters',
        'border-color-matters',
        'collector-number-matters',
        'artist-matters',
        'card-style-matters',
    ])
)
# Keep only the tags that were not flagged above.
df = tag_embeddings[~filter_mask]
embedding_columns = [f'emb_{i}' for i in range(1024)]
# Drop tags with a missing value in any embedding column.
df = df.dropna(subset=embedding_columns)
# Matrix of tag embeddings, one row per remaining tag, row-aligned with df.
embeddings = df[embedding_columns].values
embeddings.shape
(644, 1024)
UMAP Projection¶
(Interactive viewer with emblaze)
from emblaze import Viewer, Embedding
from emblaze.utils import Field, ProjectionTechnique
from emblaze.thumbnails import TextThumbnails
from emblaze.datasets import EmbeddingSet
# Generate the tag embeddings projection if it doesn't already exist;
# otherwise load the previously saved comparison from disk.
emblaze_file = Path('data/tag_embeddings_spectral.json')
if not emblaze_file.exists():
    # Create the Emblaze Embedding object
    emb = Embedding({
        Field.POSITION: embeddings,
        Field.COLOR: df['name'].values,
        # Draw category tags with a larger radius. Membership is tested on the
        # tag *name*: comparing df['id'] (UUIDs) against category_tags['name']
        # never matches, which left every point at radius 1.
        Field.RADIUS: np.where(df['name'].isin(category_tags['name']), 5, 1),
    })
    emb.compute_neighbors(n_neighbors=5, metric='cosine')

    # Create a TextThumbnails object for the tooltips
    thumbnails = TextThumbnails(
        names=df['name'].values,
        descriptions=df['description'].values
    )

    # Compute ten 2D UMAP projections of the same embeddings so the viewer
    # can compare them.
    variants = EmbeddingSet([
        emb.project(ProjectionTechnique.UMAP,
                    metric='cosine', init='spectral') for _ in range(10)
    ])
    variants.compute_neighbors(metric='cosine')

    viewer = Viewer(embeddings=variants, thumbnails=thumbnails)
    viewer.save_comparison(emblaze_file.as_posix(), overwrite=True)
# Load from file if it exists
else:
    viewer = Viewer(file=emblaze_file.as_posix())
viewer
(UMAP model with different seeds)
import umap
import matplotlib.pyplot as plt
# Render one UMAP projection per seed in a grid and save it, unless the image
# file already exists.
umap_grid_file = Path('umap_projections_grid.png')
if not umap_grid_file.exists():
    # Can also try these seeds: 60, 69, 73, 76, 84, 88, 89, 94
    seeds = [2, 13, 15, 40, 42, 46, 55, 59]
    n_cols = 4
    n_rows = -(-len(seeds) // n_cols)  # ceiling division

    fig, axes = plt.subplots(n_rows, n_cols, figsize=(16, 6))
    # Normalise to a flat sequence of Axes regardless of the grid shape.
    if n_rows > 1:
        axes = axes.flatten()
    elif n_cols == 1:
        axes = [axes]

    for i, seed in enumerate(seeds):
        umap_model = umap.UMAP(metric='cosine', init='spectral', random_state=seed)
        umap_model.fit(embeddings)

        ax = axes[i]
        ax.scatter(*umap_model.embedding_.T, s=5)
        ax.set_title(f'Random State: {seed}')
        ax.set_xlabel('UMAP Component 1')
        ax.set_ylabel('UMAP Component 2')

    # Hide unused subplots
    for unused_ax in axes[len(seeds):]:
        unused_ax.set_visible(False)

    plt.tight_layout()
    fig.savefig(umap_grid_file.as_posix(), bbox_inches='tight', dpi=300)
    plt.show()
# Fit the tag projection (seed 40) whose 2-D coordinates the clustering cells reuse.
umap_model = umap.UMAP(metric='cosine', init='spectral', random_state=40).fit(embeddings)

# Plot the UMAP projection; embedding_ has one column per UMAP component.
plt.figure(figsize=(10, 6))
plt.scatter(*umap_model.embedding_.T, s=5)
plt.title('Tag Embeddings UMAP Projection (seed=40)')
plt.xlabel('UMAP Component 1')
plt.ylabel('UMAP Component 2')
plt.show()
# Draw a reproducible random sample of cards.
sample_size = 1000
sample_cards = card_embeddings.sample(n=sample_size, random_state=0)
sample_embeddings = sample_cards.loc[:, embedding_columns].to_numpy()

# Sampled cards projected into the tag-fitted UMAP space.
sample_embedding_2d = umap_model.transform(sample_embeddings)

# A second UMAP fitted on the sampled cards, with the tags projected into it.
umap_card_model = umap.UMAP(
    metric='cosine', init='spectral', random_state=40
).fit(sample_embeddings)
tag_embeddings_in_card_space = umap_card_model.transform(embeddings)
KMeans Clustering¶
from sklearn.cluster import MiniBatchKMeans
# Score MiniBatch k-means on the 2-D tag projection for 5..29 clusters and keep
# the cluster count with the highest silhouette score.
cluster_range = range(5, 30)
silhouette_scores = [
    silhouette_score(
        umap_model.embedding_,
        MiniBatchKMeans(n_clusters=k, random_state=0).fit_predict(umap_model.embedding_),
    )
    for k in cluster_range
]
optimal_n_clusters = cluster_range[int(np.argmax(silhouette_scores))]
print(f"Optimal number of clusters: {optimal_n_clusters}")

# Plot the silhouette scores
plt.figure(figsize=(6, 6))
plt.plot(cluster_range, silhouette_scores, marker='o')
plt.xlabel('Number of Clusters')
plt.ylabel('Silhouette Score')
plt.title('K-Means: Optimal Number of Clusters')
plt.axvline(
    x=optimal_n_clusters,
    color='red',
    linestyle='--',
    label=f'Optimal: {optimal_n_clusters}',
)
plt.legend()
plt.show()

# Perform k-means clustering with the optimal number of clusters
kmeans = MiniBatchKMeans(n_clusters=optimal_n_clusters, random_state=0)
cluster_labels = kmeans.fit_predict(umap_model.embedding_)

# Attach the label of each tag; rows of df align with rows of the projection.
cluster_df = df.copy()
cluster_df['cluster'] = cluster_labels
unique_labels = np.unique(cluster_df['cluster'])
n_clusters = int(np.count_nonzero(unique_labels != -1))

# Create color map: one Spectral colour per cluster label
colors = cm.Spectral(np.linspace(0, 1, n_clusters))
color_dict = {label: colors[i] for i, label in enumerate(unique_labels)}
point_colors = [color_dict[label] for label in cluster_df['cluster']]

# Plot the UMAP projection with optimal clustering
plt.figure(figsize=(8, 6))
plt.scatter(
    umap_model.embedding_[:, 0],
    umap_model.embedding_[:, 1],
    c=point_colors,
    s=5,
)
legend_elements = []
for label in unique_labels:
    legend_elements.append(Patch(color=color_dict[label], label=f'Cluster {label}'))
plt.legend(handles=legend_elements, bbox_to_anchor=(1.05, 1), loc='upper left')
plt.title(f'Tag Embeddings UMAP Projection with MiniBatch K-Means (n_clusters={optimal_n_clusters})')
plt.xlabel('UMAP Component 1')
plt.ylabel('UMAP Component 2')
plt.show()
Optimal number of clusters: 19
# Side-by-side comparison of the two UMAP spaces, with tags coloured by cluster.
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
tag_space_ax, card_space_ax = axes

# Left panel (axes[0]): tag-fitted UMAP space — sampled cards as a faint
# backdrop, tags drawn on top.
tag_space_ax.scatter(
    sample_embedding_2d[:, 0], sample_embedding_2d[:, 1],
    s=5, label='Cards', c='tab:gray', alpha=0.1,
)
tag_space_ax.scatter(
    umap_model.embedding_[:, 0], umap_model.embedding_[:, 1],
    s=5, label='Tags', c=point_colors, alpha=0.7,
)
tag_space_ax.set_title('Card Embeddings in Tag UMAP Space (seed=40)')
tag_space_ax.set_xlabel('UMAP Component 1')
tag_space_ax.set_ylabel('UMAP Component 2')

# Right panel (axes[1]): card-fitted UMAP space — cards as backdrop, tags
# transformed into that space drawn on top.
card_space_ax.scatter(
    umap_card_model.embedding_[:, 0], umap_card_model.embedding_[:, 1],
    c='black', s=5, alpha=0.1, label='Cards',
)
card_space_ax.scatter(
    tag_embeddings_in_card_space[:, 0], tag_embeddings_in_card_space[:, 1],
    s=5, label='Tags', c=point_colors, alpha=0.7,
)
card_space_ax.set_title('Tag Embeddings in Card UMAP Space (seed=40)')
card_space_ax.set_xlabel('UMAP Component 1')
card_space_ax.set_ylabel('UMAP Component 2')

# Apply the left panel's axis limits to the right panel.
# NOTE(review): the two panels come from separately fitted UMAP models, so
# shared limits may crop points in the right panel — confirm this is intended.
x_lim = tag_space_ax.get_xlim()
y_lim = tag_space_ax.get_ylim()
card_space_ax.set_xlim(x_lim)
card_space_ax.set_ylim(y_lim)

# Cluster legend, anchored outside the right panel
card_space_ax.legend(handles=legend_elements, bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()
# Predict labels for the sampled cards using the k-means model fitted on the
# tag projection. `predict` assigns each card to the nearest existing centroid;
# `fit_predict` would refit the model on the cards, so the labels would no
# longer correspond to the tag clusters, their colours or the legend.
cluster_labels = kmeans.predict(sample_embedding_2d)
cluster_df = sample_cards.copy()
# kmeans.labels_ still holds the tag labels, so use the predicted card labels.
cluster_df['cluster'] = cluster_labels
point_colors = [color_dict[label] for label in cluster_df['cluster']]

# Plot the sample embeddings with cluster labels
plt.figure(figsize=(8, 6))
plt.scatter(
    sample_embedding_2d[:, 0],
    sample_embedding_2d[:, 1],
    c=point_colors,
    s=5,
)
plt.legend(handles=legend_elements, bbox_to_anchor=(1.05, 1), loc='upper left')
plt.title(f'Card Embeddings with MiniBatch K-Means (n_clusters={optimal_n_clusters})')
plt.xlabel('Embedding Component 1')
plt.ylabel('Embedding Component 2')
plt.show()
Spectral Clustering¶
from sklearn.cluster import SpectralClustering
# Grid search for optimal number of clusters: silhouette score of spectral
# clustering on the 2-D tag projection for 5..29 clusters.
cluster_range = range(5, 30)
silhouette_scores = []
for n_clusters in cluster_range:
    spectral_model = SpectralClustering(
        n_clusters=n_clusters, affinity='nearest_neighbors', random_state=40
    )
    cluster_labels = spectral_model.fit_predict(umap_model.embedding_)
    score = silhouette_score(umap_model.embedding_, cluster_labels)
    silhouette_scores.append(score)
    print(f"n_clusters={n_clusters}, silhouette_score={score:.3f}")

best_index = int(np.argmax(silhouette_scores))
optimal_n_clusters = cluster_range[best_index]
best_score = silhouette_scores[best_index]
print(f"\nOptimal number of clusters: {optimal_n_clusters}")
print(f"Best silhouette score: {best_score:.3f}")

# Plot the silhouette scores
plt.figure(figsize=(6, 6))
plt.plot(cluster_range, silhouette_scores, marker='o')
plt.xlabel('Number of Clusters')
plt.ylabel('Silhouette Score')
plt.title('Spectral Clustering: Optimal Number of Clusters')
plt.axvline(
    x=optimal_n_clusters,
    color='red',
    linestyle='--',
    label=f'Optimal: {optimal_n_clusters}',
)
plt.legend()
plt.show()

# Fit final model with optimal parameters
spectral_model = SpectralClustering(
    n_clusters=optimal_n_clusters, affinity='nearest_neighbors', random_state=40
)
spectral_model.fit(umap_model.embedding_)

# Attach the label of each tag; rows of df align with rows of the projection.
cluster_df = df.copy()
cluster_df['cluster'] = spectral_model.labels_
unique_labels = np.unique(cluster_df['cluster'])
n_clusters = int(np.count_nonzero(unique_labels != -1))

# Create color map: one Spectral colour per cluster label
colors = cm.Spectral(np.linspace(0, 1, n_clusters))
color_dict = {label: colors[i] for i, label in enumerate(unique_labels)}
point_colors = [color_dict[label] for label in cluster_df['cluster']]

# Plot the UMAP projection with optimal clustering
plt.figure(figsize=(10, 6))
plt.scatter(
    umap_model.embedding_[:, 0],
    umap_model.embedding_[:, 1],
    c=point_colors,
    s=5,
)
legend_elements = [
    Patch(facecolor=color_dict[label], label=f'Cluster {label}')
    for label in sorted(unique_labels)
]
plt.legend(handles=legend_elements, bbox_to_anchor=(1.05, 1), loc='upper left')
plt.title(f'Tag Embeddings UMAP Projection with Spectral Clustering (n_clusters={optimal_n_clusters})')
plt.xlabel('UMAP Component 1')
plt.ylabel('UMAP Component 2')
plt.show()
n_clusters=5, silhouette_score=0.432 n_clusters=6, silhouette_score=0.381 n_clusters=7, silhouette_score=0.375 n_clusters=8, silhouette_score=0.386 n_clusters=9, silhouette_score=0.415 n_clusters=10, silhouette_score=0.459 n_clusters=11, silhouette_score=0.325 n_clusters=12, silhouette_score=0.436 n_clusters=13, silhouette_score=0.473 n_clusters=14, silhouette_score=0.435 n_clusters=15, silhouette_score=0.459 n_clusters=16, silhouette_score=0.480 n_clusters=17, silhouette_score=0.496 n_clusters=18, silhouette_score=0.506 n_clusters=19, silhouette_score=0.493 n_clusters=20, silhouette_score=0.474 n_clusters=21, silhouette_score=0.464 n_clusters=22, silhouette_score=0.477 n_clusters=23, silhouette_score=0.482 n_clusters=24, silhouette_score=0.511 n_clusters=25, silhouette_score=0.520 n_clusters=26, silhouette_score=0.516 n_clusters=27, silhouette_score=0.496 n_clusters=28, silhouette_score=0.500 n_clusters=29, silhouette_score=0.497 Optimal number of clusters: 25 Best silhouette score: 0.520
# Side-by-side comparison of the two UMAP spaces, with tags coloured by cluster.
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Right plot (axes[1]): card-fitted UMAP space, with the tag embeddings
# transformed into it and drawn on top of the cards.
axes[1].scatter(
    umap_card_model.embedding_[:, 0],
    umap_card_model.embedding_[:, 1],
    c='black',
    s=5,
    alpha=0.1,
    label='Cards'
)
axes[1].scatter(
    tag_embeddings_in_card_space[:, 0],
    tag_embeddings_in_card_space[:, 1],
    s=5,
    label='Tags',
    c=point_colors,
    alpha=0.7
)
axes[1].set_title('Tag Embeddings in Card UMAP Space (seed=40)')
axes[1].set_xlabel('UMAP Component 1')
axes[1].set_ylabel('UMAP Component 2')

# Left plot (axes[0]): tag-fitted UMAP space, with the sampled cards
# transformed into it as a faint backdrop.
axes[0].scatter(
    sample_embedding_2d[:, 0],
    sample_embedding_2d[:, 1],
    s=5,
    label='Cards',
    c='tab:gray',
    alpha=0.1
)
axes[0].scatter(
    umap_model.embedding_[:, 0],
    umap_model.embedding_[:, 1],
    s=5,
    label='Tags',
    c=point_colors,
    alpha=0.7
)
axes[0].set_title('Card Embeddings in Tag UMAP Space (seed=40)')
axes[0].set_xlabel('UMAP Component 1')
axes[0].set_ylabel('UMAP Component 2')

# Set same axes range for both plots (using the left plot's range)
x_lim = axes[0].get_xlim()
y_lim = axes[0].get_ylim()
axes[1].set_xlim(x_lim)
axes[1].set_ylim(y_lim)

# One legend entry per coloured cluster. Iterate the keys of color_dict (the
# labels that actually have a colour) rather than the per-point
# `cluster_labels` array, which repeats every label once per tag and may hold
# labels from a different clustering run that have no colour assigned.
legend_elements = [
    Patch(facecolor=color_dict[label], label=f'Cluster {label}')
    for label in sorted(color_dict)
]

# Add legend for right plot
axes[1].legend(handles=legend_elements + [Patch(facecolor='grey', label='Cards')],
               bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()
# Side-by-side comparison of the two UMAP spaces, with tags coloured by cluster.
# NOTE(review): this cell appears to repeat the comparison figure directly
# above it — confirm whether both are needed.
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Right plot (axes[1]): card-fitted UMAP space, with the tag embeddings
# transformed into it and drawn on top of the cards.
axes[1].scatter(
    umap_card_model.embedding_[:, 0],
    umap_card_model.embedding_[:, 1],
    c='black',
    s=5,
    alpha=0.1,
    label='Cards'
)
axes[1].scatter(
    tag_embeddings_in_card_space[:, 0],
    tag_embeddings_in_card_space[:, 1],
    s=5,
    label='Tags',
    c=point_colors,
    alpha=0.7
)
axes[1].set_title('Tag Embeddings in Card UMAP Space (seed=40)')
axes[1].set_xlabel('UMAP Component 1')
axes[1].set_ylabel('UMAP Component 2')

# Left plot (axes[0]): tag-fitted UMAP space, with the sampled cards
# transformed into it as a faint backdrop.
axes[0].scatter(
    sample_embedding_2d[:, 0],
    sample_embedding_2d[:, 1],
    s=5,
    label='Cards',
    c='tab:gray',
    alpha=0.1
)
axes[0].scatter(
    umap_model.embedding_[:, 0],
    umap_model.embedding_[:, 1],
    s=5,
    label='Tags',
    c=point_colors,
    alpha=0.7
)
axes[0].set_title('Card Embeddings in Tag UMAP Space (seed=40)')
axes[0].set_xlabel('UMAP Component 1')
axes[0].set_ylabel('UMAP Component 2')

# Set same axes range for both plots (using the left plot's range)
x_lim = axes[0].get_xlim()
y_lim = axes[0].get_ylim()
axes[1].set_xlim(x_lim)
axes[1].set_ylim(y_lim)

# One legend entry per coloured cluster. Iterate the keys of color_dict (the
# labels that actually have a colour) rather than the per-point
# `cluster_labels` array, which repeats every label once per tag and may hold
# labels from a different clustering run that have no colour assigned.
legend_elements = [
    Patch(facecolor=color_dict[label], label=f'Cluster {label}')
    for label in sorted(color_dict)
]

# Add legend for right plot
axes[1].legend(handles=legend_elements + [Patch(facecolor='grey', label='Cards')],
               bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()
HDBSCAN Clustering¶
import hdbscan
# Use HDBSCAN to cluster the 2-D tag projection. prediction_data=True is
# requested so the fitted model can be passed to hdbscan.approximate_predict.
min_cluster_size = 10
hdbscan_model = hdbscan.HDBSCAN(
    min_cluster_size=min_cluster_size,
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True,
)
hdbscan_model.fit(umap_model.embedding_)

# Attach the label of each tag; label -1 is treated as noise below.
cluster_df = df.copy()
cluster_df['cluster'] = hdbscan_model.labels_
unique_labels = np.unique(cluster_df['cluster'])
cluster_labels = unique_labels[unique_labels != -1]
n_clusters = len(cluster_labels)

# Create color map: black for noise, one Spectral colour per real cluster
colors = cm.Spectral(np.linspace(0, 1, n_clusters))
color_dict = {-1: 'black'}
color_dict.update({label: colors[i] for i, label in enumerate(cluster_labels)})
point_colors = [color_dict[label] for label in cluster_df['cluster']]

plt.figure(figsize=(8, 6))
plt.scatter(
    umap_model.embedding_[:, 0],
    umap_model.embedding_[:, 1],
    c=point_colors,
    s=5,
)

# Legend: the noise entry first, then the clusters in ascending label order.
legend_elements = [
    Patch(facecolor='black', label='Noise (-1)'),
    *(
        Patch(facecolor=color_dict[label], label=f'Cluster {label}')
        for label in sorted(cluster_labels)
    ),
]
plt.legend(handles=legend_elements, bbox_to_anchor=(1.05, 1), loc='upper left')
plt.title('UMAP Projection (seed=40) - HDBSCAN Clusters')
plt.xlabel('UMAP Component 1')
plt.ylabel('UMAP Component 2')
plt.tight_layout()
plt.show()
# Show tags from the main cluster (label 9), ordered alphabetically by name.
# NOTE(review): the label 9 is hard-coded; confirm it still identifies the
# intended cluster whenever the clustering is re-run.
cluster_df.loc[cluster_df['cluster'].eq(9), ['id', 'name', 'description']].sort_values(by='name')
| id | name | description | |
|---|---|---|---|
| 900 | 1ab2a220-a953-4f28-b57a-2c5cb82dde7b | abrade | Modal instant or sorcery spells that offer the... |
| 479 | 776edc17-b3ed-47f5-aa6b-f6b2b96faab0 | affinity-for-humans | NaN |
| 949 | 9d44c734-2c29-4f4a-9a6b-87c71dff7c7e | afflict | An ability that makes defending players lose l... |
| 1806 | b9f47bcf-658c-4ccf-ab4b-3fb0391f052f | animate | Effects that turn things into creatures. |
| 1314 | fc71df0d-d1be-4ff7-acd1-e6d8905dca52 | animate-artifact | Cards that can turn other noncreature artifact... |
| ... | ... | ... | ... |
| 2082 | dc387208-0ac0-4d71-880c-e78ffe2e42dc | type-addition-phyrexian | Gained the Phyrexian type, either after it was... |
| 1705 | edc6bdfc-9e57-432a-aacb-c096998206e1 | untracked-indefinite-effect | Effects that last forever but aren't tracked b... |
| 1093 | 76a0dc71-aeb7-4671-8924-3dbb62900b8d | vigilance-counter | NaN |
| 1795 | 4f7c07de-5bef-4dff-a9f8-64e9afa5add6 | virtual-french-vanilla | Creatures and vehicles that are french vanilla... |
| 903 | e181b762-73f6-4a38-91fd-2d54b6a210ec | virtual-vanilla | These creatures are effectively just vanilla a... |
254 rows × 3 columns
# Side-by-side comparison of the two UMAP spaces, with tags coloured by cluster.
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
tag_space_ax, card_space_ax = axes

# Left panel (axes[0]): tag-fitted UMAP space — sampled cards as a faint
# backdrop, tags drawn on top.
tag_space_ax.scatter(
    sample_embedding_2d[:, 0], sample_embedding_2d[:, 1],
    s=5, label='Cards', c='tab:gray', alpha=0.1,
)
tag_space_ax.scatter(
    umap_model.embedding_[:, 0], umap_model.embedding_[:, 1],
    s=5, label='Tags', c=point_colors, alpha=0.7,
)
tag_space_ax.set_title('Card Embeddings in Tag UMAP Space (seed=40)')
tag_space_ax.set_xlabel('UMAP Component 1')
tag_space_ax.set_ylabel('UMAP Component 2')

# Right panel (axes[1]): card-fitted UMAP space — cards as backdrop, tags
# transformed into that space drawn on top.
card_space_ax.scatter(
    umap_card_model.embedding_[:, 0], umap_card_model.embedding_[:, 1],
    c='black', s=5, alpha=0.1, label='Cards',
)
card_space_ax.scatter(
    tag_embeddings_in_card_space[:, 0], tag_embeddings_in_card_space[:, 1],
    s=5, label='Tags', c=point_colors, alpha=0.7,
)
card_space_ax.set_title('Tag Embeddings in Card UMAP Space (seed=40)')
card_space_ax.set_xlabel('UMAP Component 1')
card_space_ax.set_ylabel('UMAP Component 2')

# Apply the left panel's axis limits to the right panel.
# NOTE(review): the two panels come from separately fitted UMAP models, so
# shared limits may crop points in the right panel — confirm this is intended.
x_lim = tag_space_ax.get_xlim()
y_lim = tag_space_ax.get_ylim()
card_space_ax.set_xlim(x_lim)
card_space_ax.set_ylim(y_lim)

# Cluster legend plus an extra entry for the cards, anchored outside the right panel
card_space_ax.legend(
    handles=legend_elements + [Patch(facecolor='grey', label='Cards')],
    bbox_to_anchor=(1.05, 1),
    loc='upper left',
)
plt.tight_layout()
plt.show()
# Re-use the original hdbscan model to predict a cluster for every sampled card.
# The prediction is made on `sample_embedding_2d`, i.e. on exactly the
# coordinates that are scattered below, so each colour belongs to the point it
# is drawn at. (Previously the labels were predicted on
# `umap_card_model.transform(sample_embeddings)`, a different 2-D space from
# the one being plotted.)
# NOTE(review): this assumes `hdbscan_model` was fitted in the tag-UMAP space
# that `sample_embedding_2d` lives in -- confirm against the cell that fits it.
approximate_predict = hdbscan.approximate_predict(
    hdbscan_model, sample_embedding_2d
)

plt.figure(figsize=(8, 6))
plt.scatter(
    sample_embedding_2d[:, 0],
    sample_embedding_2d[:, 1],
    # Re-use the same colour mapping from before; element [0] of the
    # prediction result is used as the per-point cluster label.
    c=[color_dict[label] for label in approximate_predict[0]],
    s=5,
    alpha=0.7
)
plt.legend(handles=legend_elements, bbox_to_anchor=(1.05, 1), loc='upper left')
plt.title('Card Embeddings with HDBSCAN Clusters (seed=40)')
plt.xlabel('UMAP Component 1')
plt.ylabel('UMAP Component 2')
plt.tight_layout()
plt.show()
# Candidate values of min_cluster_size to evaluate on the 2-D tag embedding.
min_cluster_sizes = list(range(5, 30))
silhouette_scores = []

for min_cluster_size in min_cluster_sizes:
    hdbscan_model = hdbscan.HDBSCAN(
        cluster_selection_method='eom',
        metric='euclidean',
        min_cluster_size=min_cluster_size,
    )
    cluster_labels = hdbscan_model.fit_predict(umap_model.embedding_)

    # Count the real clusters; the noise label -1 is not counted as one.
    unique_labels = np.unique(cluster_labels)
    n_clusters = len(unique_labels[unique_labels != -1])

    if n_clusters <= 1:
        # Fewer than two clusters: store -1 as an "invalid" placeholder so the
        # list stays aligned with min_cluster_sizes.
        silhouette_scores.append(-1)
        print(f"min_cluster_size={min_cluster_size}, n_clusters={n_clusters}, silhouette_score=N/A")
        continue

    # NOTE(review): cluster_labels is passed unfiltered, so points labelled -1
    # (noise) take part in the score as if they formed a cluster of their own.
    score = silhouette_score(umap_model.embedding_, cluster_labels)
    silhouette_scores.append(score)
    print(f"min_cluster_size={min_cluster_size}, n_clusters={n_clusters}, silhouette_score={score:.3f}")

# The best setting is the first one that reaches the highest score.
optimal_idx = np.argmax(silhouette_scores)
best_score = silhouette_scores[optimal_idx]
optimal_min_cluster_size = min_cluster_sizes[optimal_idx]
print(f"\nOptimal min_cluster_size: {optimal_min_cluster_size}")
print(f"Best silhouette score: {best_score:.3f}")
# Silhouette score as a function of min_cluster_size, with the selected
# value marked by a dashed vertical line.
score_fig, score_ax = plt.subplots(figsize=(6, 6))
score_ax.plot(min_cluster_sizes, silhouette_scores, marker='o')
score_ax.axvline(
    x=optimal_min_cluster_size,
    color='red',
    linestyle='--',
    label=f'Optimal: {optimal_min_cluster_size}',
)
score_ax.set_xlabel('Min Cluster Size')
score_ax.set_ylabel('Silhouette Score')
score_ax.set_title('HDBSCAN: Optimal Min Cluster Size')
score_ax.legend()
score_ax.grid(True, alpha=0.3)
plt.show()
# Fit the final HDBSCAN model with the selected min_cluster_size.
# prediction_data=True makes the fit also store the data that
# hdbscan.approximate_predict() needs; this model is passed to
# approximate_predict() further down, which fails on a model fitted without
# it. The flag does not change the cluster labels of the fitted points.
hdbscan_model = hdbscan.HDBSCAN(
    min_cluster_size=optimal_min_cluster_size,
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True
)
hdbscan_model.fit(umap_model.embedding_)
# Attach the fitted cluster label to a copy of the tag frame.
cluster_df = df.copy()
cluster_df['cluster'] = hdbscan_model.labels_

# Distinct labels, and the subset that are real clusters (everything but -1).
unique_labels = np.unique(cluster_df['cluster'])
cluster_labels = unique_labels[unique_labels != -1]
n_clusters = len(cluster_labels)

# One Spectral colour per real cluster, evenly spaced over the colormap;
# the noise label -1 is always black.
colors = cm.Spectral(np.linspace(0, 1, n_clusters))
color_dict = {-1: 'black', **dict(zip(cluster_labels, colors))}
# Legend entries: noise first, then one patch per cluster in ascending order.
legend_elements = [Patch(facecolor='black', label='Noise (-1)')] + [
    Patch(facecolor=color_dict[label], label=f'Cluster {label}')
    for label in sorted(cluster_labels)
]

# Scatter the 2-D tag embedding, coloured by the final HDBSCAN labels.
plt.figure(figsize=(8, 6))
plt.scatter(
    umap_model.embedding_[:, 0],
    umap_model.embedding_[:, 1],
    c=[color_dict[label] for label in hdbscan_model.labels_],
    s=5,
    alpha=0.7,
)
plt.title(f'Tag Embeddings with HDBSCAN Clusters (min_cluster_size={optimal_min_cluster_size})')
plt.xlabel('UMAP Component 1')
plt.ylabel('UMAP Component 2')
plt.legend(handles=legend_elements, bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()
min_cluster_size=5, n_clusters=32, silhouette_score=0.327 min_cluster_size=6, n_clusters=25, silhouette_score=0.361 min_cluster_size=7, n_clusters=20, silhouette_score=0.361 min_cluster_size=8, n_clusters=18, silhouette_score=0.320 min_cluster_size=9, n_clusters=17, silhouette_score=0.299 min_cluster_size=10, n_clusters=10, silhouette_score=0.417 min_cluster_size=11, n_clusters=10, silhouette_score=0.424 min_cluster_size=12, n_clusters=9, silhouette_score=0.428 min_cluster_size=13, n_clusters=9, silhouette_score=0.427 min_cluster_size=14, n_clusters=9, silhouette_score=0.421 min_cluster_size=15, n_clusters=9, silhouette_score=0.407 min_cluster_size=16, n_clusters=9, silhouette_score=0.404 min_cluster_size=17, n_clusters=9, silhouette_score=0.405 min_cluster_size=18, n_clusters=9, silhouette_score=0.404 min_cluster_size=19, n_clusters=8, silhouette_score=0.373 min_cluster_size=20, n_clusters=2, silhouette_score=0.327 min_cluster_size=21, n_clusters=7, silhouette_score=0.331 min_cluster_size=22, n_clusters=6, silhouette_score=0.295 min_cluster_size=23, n_clusters=6, silhouette_score=0.277 min_cluster_size=24, n_clusters=6, silhouette_score=0.274 min_cluster_size=25, n_clusters=6, silhouette_score=0.261 min_cluster_size=26, n_clusters=5, silhouette_score=0.235 min_cluster_size=27, n_clusters=5, silhouette_score=0.232 min_cluster_size=28, n_clusters=3, silhouette_score=0.356 min_cluster_size=29, n_clusters=3, silhouette_score=0.336 Optimal min_cluster_size: 12 Best silhouette score: 0.428
Cluster Evaluation¶
Get ground truth tag labels for the original tagged cards
import json
# cards.json: a mapping whose keys are used as card identifiers; each key's
# position in the file becomes that card's integer index.
with open('data/raw/cards.json', 'r', encoding='utf-8') as f:
    cards = json.load(f)
cards_keys = list(cards)
oracle_id_to_idx = {oracle_id: position for position, oracle_id in enumerate(cards_keys)}

# tags.json: a list of tag records. Build slug -> position and
# slug -> taggingCount lookups.
with open('data/raw/tags.json', 'r', encoding='utf-8') as f:
    tags = json.load(f)
tag_keys = {record['slug']: position for position, record in enumerate(tags)}
tag_counts = {record['slug']: record['taggingCount'] for record in tags}

# For every tag with a truthy 'category' field, collect the slugs of its
# descendants (tags.json). A descendant id with no known slug yields None.
id_to_slug = {record['id']: record['slug'] for record in tags}
category_tags = {
    record['slug']: [
        id_to_slug.get(descendant['tag']['id'])
        for descendant in record['descendants']
    ]
    for record in tags
    if record.get('category')
}

# taggings.json: looked up by tag slug elsewhere in this notebook.
with open('data/raw/taggings.json', 'r', encoding='utf-8') as f:
    taggings = json.load(f)
# copied from analyze-tags.py script
def sample_tag_corpora(top_k: int = 100, seed: "int | None" = None) -> dict[str, set[int]]:
    """Collect, for every tag, a set of card indices tagged with it.

    Reads the module-level ``category_tags``, ``taggings`` and
    ``oracle_id_to_idx`` mappings.

    * Category tags: each descendant slug gets a quota proportional to its
      share of the descendants' taggings (``int(share * top_k)``), and that
      many of its taggings are drawn at random without replacement.
    * Regular tags: the first ``top_k`` taggings of every slug in
      ``taggings`` are taken as-is (no random sampling). Because this pass
      runs second, it overwrites a category entry that has the same slug.

    Taggings whose ``oracleId`` is not in ``oracle_id_to_idx`` are skipped,
    and tags that end up with no card index are left out of the result.

    Args:
        top_k: Upper bound on the number of taggings considered per tag.
        seed: Optional seed that makes the random draws reproducible. When
            ``None`` (the default) NumPy's global random state is used,
            exactly as before, so a caller's ``np.random.seed`` still applies.

    Returns:
        Mapping from tag slug to a set of card indices.
    """
    rng = np.random if seed is None else np.random.RandomState(seed)

    def known_card_indices(items):
        # Translate tagging records to card indices, dropping unknown cards.
        return [
            oracle_id_to_idx[item['card']['oracleId']]
            for item in items
            if item['card']['oracleId'] in oracle_id_to_idx
        ]

    tag_idxs = {}

    # Category tags: proportional sampling from descendants
    for category_slug, descendant_slugs in category_tags.items():
        if not descendant_slugs:
            continue
        total_count = sum(len(taggings.get(slug, [])) for slug in descendant_slugs)
        if total_count == 0:
            continue
        sample_size = {
            slug: int(len(taggings.get(slug, [])) / total_count * top_k)
            for slug in descendant_slugs
        }
        idxs = []
        for slug, count in sample_size.items():
            if slug not in taggings or count <= 0:
                continue
            items = taggings[slug]
            # Draw positions rather than the records themselves; this avoids
            # converting the list of dicts into a NumPy object array.
            picked = rng.choice(len(items), size=min(count, len(items)), replace=False)
            idxs.extend(known_card_indices(items[i] for i in picked))
        if idxs:
            tag_idxs[category_slug] = set(idxs)

    # Regular tags: take the first top_k taggings
    for tag_slug, tagged_items in taggings.items():
        idxs = known_card_indices(tagged_items[:top_k])
        if idxs:
            tag_idxs[tag_slug] = set(idxs)

    return tag_idxs
# Card index sets per tag, considering at most 100 taggings per tag.
tag_corpora = sample_tag_corpora(top_k=100)

# Alternative kept for reference: restrict to the category tags only.
# sampled_category_tags = df[df['name'].isin(category_tags.keys())]

# Despite the variable name, this is a random subset of 200 rows drawn from
# the whole 'df' frame (fixed random_state for repeatability).
sampled_category_tags = df.sample(200, random_state=42)
sampled_category_tags.shape
(200, 1027)
# For every sampled tag that has a non-empty corpus, take the card rows at
# the corpus positions and label them with the tag, then stack all of them
# into one frame with a fresh 0..n-1 index.
sampled_tagged_cards = pd.concat(
    [
        card_embeddings.iloc[list(tag_corpora[tag])].assign(tag=tag)
        for tag in sampled_category_tags['name'].unique()
        if tag in tag_corpora and tag_corpora[tag]
    ],
    ignore_index=True,
)
Get the cluster labels for each of the sampled category tags and tagged cards.
# Project the sampled tags and their tagged cards with the tag-fitted UMAP model.
sampled_category_tags_emb = sampled_category_tags[embedding_columns].to_numpy()
sampled_category_tags_2d = umap_model.transform(sampled_category_tags_emb)
sampled_tagged_cards_emb = sampled_tagged_cards[embedding_columns].to_numpy()
sampled_tagged_cards_2d = umap_model.transform(sampled_tagged_cards_emb)

# K-means: predict with the already fitted model, tags first and then cards.
for frame, coords in (
    (sampled_category_tags, sampled_category_tags_2d),
    (sampled_tagged_cards, sampled_tagged_cards_2d),
):
    frame['kmeans_label'] = kmeans.predict(coords)

# Spectral clustering: fit once on tags and cards stacked together so both
# share one labelling, then split the labels back at the tag/card boundary.
combined_2d = np.concatenate((sampled_category_tags_2d, sampled_tagged_cards_2d), axis=0)
combined_labels = spectral_model.fit_predict(combined_2d)
n_tags = len(sampled_category_tags_2d)
sampled_category_tags['spectral_label'], sampled_tagged_cards['spectral_label'] = (
    combined_labels[:n_tags],
    combined_labels[n_tags:],
)

# HDBSCAN: approximate prediction against the fitted model; element [0] of
# each result is stored as the label column.
tag_predict = hdbscan.approximate_predict(hdbscan_model, sampled_category_tags_2d)
card_predict = hdbscan.approximate_predict(hdbscan_model, sampled_tagged_cards_2d)
sampled_category_tags['hdbscan_label'] = tag_predict[0]
sampled_tagged_cards['hdbscan_label'] = card_predict[0]

# Preview everything except the emb_* columns.
sampled_category_tags.loc[:, ~sampled_category_tags.columns.str.startswith('emb_')].head()
| id | name | description | kmeans_label | spectral_label | hdbscan_label | |
|---|---|---|---|---|---|---|
| 2210 | 0d0f01ac-83fb-4ccb-87d0-ee745ac7f242 | conjure-creature | Cards that conjure creatures. | 5 | 11 | 0 |
| 2074 | 13b9f166-027e-4e66-8844-17e86b5b5d45 | removal-aura-bounce | NaN | 2 | 20 | -1 |
| 2090 | 63423c00-f269-4fb0-946c-81b5e486e079 | hate-haste | NaN | 7 | 9 | -1 |
| 1001 | 70a43edc-86e9-4ad9-abf8-7feefee35d5d | removal-equipment | NaN | 14 | 12 | 3 |
| 277 | 95f33cbb-6b11-403d-b0a4-9c3587cf395e | tutor-creature-goblin | Cards that tutor Goblin cards. | 4 | 19 | 2 |
# Preview the tagged cards without the emb_* columns.
sampled_tagged_cards.loc[:, ~sampled_tagged_cards.columns.str.startswith('emb_')].head()
| id | name | text | tag | kmeans_label | spectral_label | hdbscan_label | |
|---|---|---|---|---|---|---|---|
| 0 | f24d0138-c3d9-48ba-9031-6249808b220b | Grave Choice | Target opponent sacrifices a nontoken creature... | conjure-creature | 5 | 11 | 0 |
| 1 | e08a4569-eb90-4a95-9420-ebb8b8c0c906 | Gyox, Brutal Carnivora | At the beginning of your end step, put an oil ... | conjure-creature | 5 | 11 | 0 |
| 2 | 973bd9c2-c2b1-429f-b426-2d722a6a2d63 | Sarkhan, Wanderer to Shiv | +1: Dragon cards in your hand perpetually gain... | conjure-creature | 5 | 11 | 0 |
| 3 | d2e98c8e-dd2a-4358-81e4-15cb287ea143 | Legion Reconsecrator | Whenever Legion Reconsecrator attacks, exile u... | conjure-creature | 3 | 18 | 8 |
| 4 | a1f1cadd-6fdb-4cce-8ad5-7ed58d098607 | Giant Fire Beetles | Menace, double team (When this creature attack... | conjure-creature | 5 | 11 | 0 |
Compare tag labels to card labels to see how well the clustering algorithms preserved the original tags.
def cluster_match_accuracy(df_cards, df_tags, label_col):
    """Return the share of cards that sit in the same cluster as their tag.

    Args:
        df_cards: Frame with a ``tag`` column and a ``label_col`` column.
        df_tags: Frame with a ``name`` column and a ``label_col`` column.
            If a name occurs more than once, its last row wins.
        label_col: Name of the cluster-label column present in both frames.

    Returns:
        The mean of the per-card match indicator. Cards whose tag does not
        appear in ``df_tags`` count as mismatches. NaN when ``df_cards`` has
        no rows.
    """
    tag_label_map = dict(zip(df_tags['name'], df_tags[label_col]))
    # Vectorised lookup and comparison: no per-row Series is built, which
    # matters because the card frame carries every embedding column.
    expected = df_cards['tag'].map(tag_label_map)
    # Compare by position (to_numpy) so index alignment cannot interfere.
    matches = df_cards[label_col].eq(expected.to_numpy())
    return matches.mean()
# Report the match accuracy of each clustering method, in a fixed order.
for method_name, method_label_col in (
    ("KMeans", 'kmeans_label'),
    ("Spectral", 'spectral_label'),
    ("HDBSCAN", 'hdbscan_label'),
):
    print(
        f"{method_name} cluster match accuracy:",
        cluster_match_accuracy(sampled_tagged_cards, sampled_category_tags, method_label_col),
    )
KMeans cluster match accuracy: 0.5084075173095944 Spectral cluster match accuracy: 0.4126277612924497 HDBSCAN cluster match accuracy: 0.5914935707220573
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score
def cluster_metrics(df_cards, df_tags, label_col):
    """Score, per tag, how well "same cluster as the tag" predicts membership.

    For each name in ``df_tags`` a binary problem is set up over all rows of
    ``df_cards``: the truth is whether the card carries that tag, the
    prediction is whether the card's ``label_col`` equals the tag's own
    ``label_col`` value (taken from the first matching row of ``df_tags``).

    Args:
        df_cards: Frame with a ``tag`` column and a ``label_col`` column.
        df_tags: Frame with a ``name`` column and a ``label_col`` column.
        label_col: Name of the cluster-label column present in both frames.

    Returns:
        DataFrame with columns ``tag``, ``precision``, ``recall``, ``f1`` and
        ``auc``, sorted by ``f1`` in descending order. All four scores are
        None for a tag without any card; ``auc`` alone is None when
        ``roc_auc_score`` raises ``ValueError``.
    """
    rows = []
    for tag in df_tags['name']:
        is_this_tag = df_tags['name'] == tag
        tag_cluster = df_tags.loc[is_this_tag, label_col].iloc[0]

        # Binary ground truth (card has this tag) and binary prediction
        # (card shares the tag's cluster).
        y_true = (df_cards['tag'] == tag).astype(int)
        y_pred = (df_cards[label_col] == tag_cluster).astype(int)

        scores = dict.fromkeys(('precision', 'recall', 'f1', 'auc'))
        if y_true.sum() > 0:
            scores['precision'] = precision_score(y_true, y_pred, zero_division=0)
            scores['recall'] = recall_score(y_true, y_pred, zero_division=0)
            scores['f1'] = f1_score(y_true, y_pred, zero_division=0)
            try:
                scores['auc'] = roc_auc_score(y_true, y_pred)
            except ValueError:
                scores['auc'] = None

        rows.append({'tag': tag, **scores})

    return pd.DataFrame(rows).sort_values(by='f1', ascending=False)
# Per-tag metrics for each of the three clustering methods.
kmeans_metrics = cluster_metrics(sampled_tagged_cards, sampled_category_tags, 'kmeans_label')
spectral_metrics = cluster_metrics(sampled_tagged_cards, sampled_category_tags, 'spectral_label')
hdsbcan_metrics = cluster_metrics(sampled_tagged_cards, sampled_category_tags, 'hdbscan_label')

# Suffix every column with its method so the three tables can sit side by side.
kmeans_metrics_renamed = kmeans_metrics.add_suffix('_kmeans')
spectral_metrics_renamed = spectral_metrics.add_suffix('_spectral')
hdbscan_metrics_renamed = hdsbcan_metrics.add_suffix('_hdbscan')

# Join the three tables on the tag name.
combined_metrics = (
    kmeans_metrics_renamed
    .merge(spectral_metrics_renamed, left_on='tag_kmeans', right_on='tag_spectral')
    .merge(hdbscan_metrics_renamed, left_on='tag_kmeans', right_on='tag_hdbscan')
)

# Average each metric (precision, recall, F1, AUC) across the three methods.
for metric_name in ('precision', 'recall', 'f1', 'auc'):
    per_method_columns = [
        f'{metric_name}_{method_suffix}'
        for method_suffix in ('kmeans', 'spectral', 'hdbscan')
    ]
    combined_metrics[f'avg_{metric_name}'] = combined_metrics[per_method_columns].mean(axis=1)

# Keep the 50 tags with the highest average F1 score.
top_tags = combined_metrics.nlargest(50, 'avg_f1').rename(columns={'tag_kmeans': 'tag'})

# Add the original tag descriptions
top_tags['description'] = top_tags['tag'].map(df.set_index('name')['description'])

report_columns = [
    'tag', 'description', 'avg_precision', 'avg_recall', 'avg_f1',
    'f1_kmeans', 'f1_spectral', 'f1_hdbscan', 'avg_auc',
]
top_tags = top_tags[report_columns].reset_index(drop=True)
top_tags.head(20)
| tag | description | avg_precision | avg_recall | avg_f1 | f1_kmeans | f1_spectral | f1_hdbscan | avg_auc | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | conjure-to-hand | Cards that conjure cards to your hand. | 0.333889 | 0.848101 | 0.479143 | 0.478571 | 0.480287 | 0.478571 | 0.912888 |
| 1 | conjure-creature | Cards that conjure creatures. | 0.259138 | 0.732394 | 0.382823 | 0.382353 | 0.383764 | 0.382353 | 0.853798 |
| 2 | banish-creature | NaN | 0.237134 | 0.785714 | 0.362812 | 0.347578 | 0.375839 | 0.365019 | 0.877931 |
| 3 | counterspell-soft | A "soft counterspell" gives the other player a... | 0.245834 | 0.746528 | 0.356606 | 0.366255 | 0.323651 | 0.379913 | 0.854224 |
| 4 | tutor-land-to-battlefield | NaN | 0.218170 | 0.781362 | 0.329177 | 0.354701 | 0.381295 | 0.251534 | 0.865540 |
| 5 | tutor-land-basic | Cards that tutor basic land cards. | 0.215406 | 0.762887 | 0.324375 | 0.364407 | 0.340426 | 0.268293 | 0.856397 |
| 6 | counterspell | Spells that counter stuff. See child tags for ... | 0.203668 | 0.777778 | 0.320069 | 0.343816 | 0.255591 | 0.360802 | 0.867285 |
| 7 | hate-protection | NaN | 0.200601 | 1.000000 | 0.308892 | 0.461538 | 0.461538 | 0.003599 | 0.953956 |
| 8 | bombard | Sacrifice something else to deal N damage. See... | 0.175202 | 0.666667 | 0.274960 | 0.302521 | 0.175355 | 0.347003 | 0.810912 |
| 9 | removal-creature-exile | NaN | 0.186897 | 0.456140 | 0.263885 | 0.255319 | 0.272446 | 0.263889 | 0.712104 |
| 10 | french-vanilla-walker | French vanilla creatures with only a landwalk ... | 0.158805 | 1.000000 | 0.263489 | 0.381232 | 0.341207 | 0.068027 | 0.937705 |
| 11 | removal-enchantment-destroy | NaN | 0.161656 | 0.652921 | 0.240826 | 0.274090 | 0.379421 | 0.068966 | 0.763915 |
| 12 | banish-nonland | NaN | 0.139470 | 0.888889 | 0.240456 | 0.208202 | 0.242424 | 0.270742 | 0.927695 |
| 13 | tutor-cmc | Cards that tutor cards with a certain converte... | 0.167763 | 0.445614 | 0.233790 | 0.290429 | 0.264151 | 0.146789 | 0.700198 |
| 14 | plunder | Sacrifice something else to draw cards. | 0.148038 | 0.518116 | 0.228401 | 0.242798 | 0.222222 | 0.220183 | 0.736014 |
| 15 | removal-enchantment-exile | NaN | 0.141037 | 0.593939 | 0.227124 | 0.214286 | 0.233216 | 0.233871 | 0.780223 |
| 16 | hate-haste | NaN | 0.133734 | 1.000000 | 0.223023 | 0.333333 | 0.333333 | 0.002401 | 0.953881 |
| 17 | hate-reach | NaN | 0.133734 | 1.000000 | 0.223023 | 0.333333 | 0.333333 | 0.002401 | 0.953881 |
| 18 | affinity-for-land-type | Affinity abilities that care about land types. | 0.122052 | 0.791667 | 0.199814 | 0.181818 | 0.028736 | 0.388889 | 0.884333 |
| 19 | regrowth-creature | NaN | 0.127050 | 0.608696 | 0.199398 | 0.211288 | 0.237762 | 0.149144 | 0.766657 |
import seaborn as sns
# Metrics (one panel each) and methods (one histogram each per panel),
# with their display names.
metrics = ['precision', 'recall', 'f1']
methods = ['kmeans', 'spectral', 'hdbscan']
metric_labels = {'precision': 'Precision', 'recall': 'Recall', 'f1': 'F1 Score'}
method_labels = {'kmeans': 'KMeans', 'spectral': 'Spectral', 'hdbscan': 'HDBSCAN'}

fig, axes = plt.subplots(1, 3, figsize=(15, 5))

# NOTE: despite the name, this holds the 100 rows with the highest avg_f1.
top_50_metrics = combined_metrics.nlargest(100, 'avg_f1')

for ax, metric in zip(axes, metrics):
    # Overlay one percentage histogram (with KDE) per clustering method.
    for method in methods:
        sns.histplot(
            top_50_metrics[f"{metric}_{method}"],
            kde=True,
            bins=20,
            ax=ax,
            alpha=0.6,
            label=method_labels[method],
            stat='percent',
        )

    ax.set(
        title=f"{metric_labels[metric]} Distribution",
        xlabel=metric_labels[metric],
        ylabel="",
        ylim=(0, 30),
    )
    ax.legend()
    ax.grid(True, alpha=0.3)
    # Show the y ticks as whole percentages.
    ax.yaxis.set_major_formatter(plt.FuncFormatter(lambda x, p: f'{x:.0f}%'))

plt.tight_layout()
plt.show()