summaryrefslogtreecommitdiff
path: root/blazing_env.py
diff options
context:
space:
mode:
authorhaoyuren <13851610112@163.com>2026-02-22 11:28:45 -0600
committerhaoyuren <13851610112@163.com>2026-02-22 11:28:45 -0600
commit3887054e02e622ca2cb7878bc0dec63d28c7f223 (patch)
tree1a341f7562abb41cfc25badde73879a4e914b1ee /blazing_env.py
parent1cb5eb34ead9b4efc1032ec74c6ccc439f007c18 (diff)
Fix SWAP inheritance, stalemate logic, add greedy warmup
- SWAP now inherits previous card's suit/rank for matching - Observation encodes effective top card when SWAP is on top - Fix stalemate: only hard passes (can't draw) count, draw+pass resets - Add behavioral cloning warmup: pre-train on greedy policy before PPO - 2p win rate vs greedy random: 60.5% Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Diffstat (limited to 'blazing_env.py')
-rw-r--r--blazing_env.py41
1 files changed, 27 insertions, 14 deletions
diff --git a/blazing_env.py b/blazing_env.py
index c440293..3f4b407 100644
--- a/blazing_env.py
+++ b/blazing_env.py
@@ -11,7 +11,7 @@ Special cards:
K → All OTHER players draw 1 card from the deck
Q → Reverse direction (no effect in 2-player games)
J → Skip next player's turn
- Swap → Swap entire hand with next player (playable anytime on your turn, no match needed)
+ Swap → Swap entire hand with next player (playable anytime; next card must match the card before the Swap)
Rules:
- Match top card by suit OR rank (unless playing 8 or Swap)
@@ -182,18 +182,19 @@ class BlazingEightsEnv:
for c in self.hands[player]:
obs[c] = 1.0
- # Top card info
+ # Top card info (SWAP inherits previous card)
top = self.discard[-1]
+ eff = self._effective_top() if is_swap(top) else top
if self.active_suit is not None:
suit = self.active_suit
- elif not is_swap(top):
- suit = card_suit(top)
+ elif not is_swap(eff):
+ suit = card_suit(eff)
else:
suit = 0
obs[56 + suit] = 1.0
- if not is_swap(top) and self.active_suit is None:
- obs[60 + card_rank(top)] = 1.0
+ if not is_swap(eff) and self.active_suit is None:
+ obs[60 + card_rank(eff)] = 1.0
# Direction
obs[73] = 0.0 if self.direction == 1 else 1.0
@@ -279,6 +280,13 @@ class BlazingEightsEnv:
actions.append(PASS_ACTION)
return actions
+ def _effective_top(self) -> int:
+ """Find the last non-SWAP card in discard for matching purposes."""
+ for c in reversed(self.discard):
+ if not is_swap(c):
+ return c
+ return self.discard[-1] # fallback (all swaps, shouldn't happen)
+
def _can_play(self, card: int, top: int) -> bool:
# Swap cards: always playable
if is_swap(card):
@@ -289,10 +297,11 @@ class BlazingEightsEnv:
# If active_suit is set (after a wild 8), must match that suit
if self.active_suit is not None:
return card_suit(card) == self.active_suit
- # Normal: match suit or rank
+ # SWAP on top: inherit previous non-SWAP card's suit/rank
if is_swap(top):
- # Top is swap — shouldn't happen in normal flow, but match anything
- return True
+ top = self._effective_top()
+ if is_swap(top):
+ return True # all swaps, match anything
return card_suit(card) == card_suit(top) or card_rank(card) == card_rank(top)
# ------------------------------------------------------------------
@@ -329,7 +338,6 @@ class BlazingEightsEnv:
# --- Play phase ---
if action == DRAW_ACTION:
- self.consecutive_passes = 0
drawn = self._draw_card(player)
self._record_event(player, 2) # drew a card
# Card is added to hand; player keeps their turn to decide
@@ -338,8 +346,12 @@ class BlazingEightsEnv:
obs = self._get_obs(player)
return obs, rewards, False, info
elif action == PASS_ACTION:
- # Skip turn (no cards anywhere)
- self.consecutive_passes += 1
+ if self.has_drawn_this_turn:
+ # Drew but chose not to play — game state changed, not stalemate
+ self.consecutive_passes = 0
+ else:
+ # Hard pass: can't draw and can't play — real stalemate signal
+ self.consecutive_passes += 1
self._record_event(player, 3) # passed
if self.consecutive_passes >= self.num_players:
# Stalemate: all players passed in a row
@@ -368,8 +380,9 @@ class BlazingEightsEnv:
hand.remove(card)
self.discard.append(card)
- # Clear active suit (unless new card is 8)
- self.active_suit = None
+ # Clear active suit (unless new card is 8 or SWAP — SWAP inherits)
+ if not is_swap(card):
+ self.active_suit = None
# Clear swap knowledge for this player (cards change over time)
# We keep it until they play; after playing, knowledge decays