Building a Stable Fable 5 Traces Workflow in Colab: Parsing Tool Calls, Auditing Data, and Training Baselines
This tutorial details a robust workflow for the Fable 5 Traces dataset from Hugging Face. It avoids fragile dependencies by manually parsing the merged JSONL file, normalizes tool calls, audits data structure, redacts secrets, and trains pure-Python Naive Bayes baselines to predict output types and tool usage.
In this tutorial, we work with the Fable 5 Traces dataset from Hugging Face and build a complete workflow around real coding-agent trace data. We start by setting up a lightweight environment that avoids fragile dependencies such as datasets, scikit-learn, and scipy. Then we manually download and parse the merged JSONL file to keep the notebook stable in Colab. From there, we inspect repository files, preview raw trace examples, normalize tool calls and text outputs, audit the dataset structure, detect potential secret-like patterns, and visualize key distributions, including output types, tools, source roots, and text lengths. We also create safe no-CoT chat/SFT exports, build a simple keyword-search helper, and train pure-Python Naive Bayes baselines to assess whether trace context can predict the assistant’s output type and tool usage.
Setting Up the Fable 5 Traces Colab Environment and Helpers
Code listing:
import os
import sys
import json
import re
import math
import random
import subprocess
from pathlib import Path
from collections import Counter, defaultdict


def install_packages():
    """Install the lightweight third-party packages this notebook imports.

    The install is best-effort: a failing pip run does not raise, but a
    warning is written to stderr so a later ImportError is easier to trace.
    """
    packages = [
        "huggingface_hub>=0.23.0",
        "rich>=13.0.0",
        "tqdm>=4.66.0",
    ]
    completed = subprocess.run(
        [
            sys.executable,
            "-m",
            "pip",
            "install",
            "-q",
            "-U",
            "--upgrade-strategy",
            "only-if-needed",
            *packages,
        ],
        check=False,
    )
    if completed.returncode != 0:
        # rich is not imported yet at this point, so use plain print.
        print(
            f"Warning: pip install exited with code {completed.returncode}; "
            "the imports below may fail.",
            file=sys.stderr,
        )


install_packages()

import pandas as pd
import matplotlib.pyplot as plt

# Broad catch on purpose: numpy is treated as optional throughout this cell.
try:
    import numpy as np
except Exception:
    np = None

from tqdm.auto import tqdm
from rich import print as rprint
from rich.panel import Panel
from rich.table import Table
from huggingface_hub import HfApi, hf_hub_download
from IPython.display import display

DATASET_ID = "Glint-Research/Fable-5-traces"
FLAT_JSONL_FILENAME = "fable5_cot_merged.jsonl"
OUT_DIR = Path("/content/fable5_traces_tutorial_outputs")
OUT_DIR.mkdir(parents=True, exist_ok=True)

SEED = 42
random.seed(SEED)
if np is not None:
    np.random.seed(SEED)

MAX_PREVIEW_CHARS = 900
N_AGENT_TRACE_PREVIEWS = 2
N_SAFE_DATASET_PREVIEWS = 3
SAVE_COT_RESEARCH_EXPORT = False
MAX_ROWS_TO_LOAD = None

rprint(
    Panel.fit(
        f"[bold]Fable 5 Traces Advanced Tutorial[/bold]\n"
        f"Dataset: {DATASET_ID}\n"
        f"Output directory: {OUT_DIR}\n"
        f"Manual JSONL loading: True\n"
        f"CoT research export enabled: {SAVE_COT_RESEARCH_EXPORT}",
        title="Setup",
    )
)

# Regex fragments for secret-like strings; they are OR-ed together into SECRET_RE.
SECRET_PATTERNS = [
    r"sk-[A-Za-z0-9_\-]{20,}",
    r"hf_[A-Za-z0-9_\-]{20,}",
    r"github_pat_[A-Za-z0-9_]{20,}",
    r"ghp_[A-Za-z0-9]{20,}",
    r"xox[baprs]-[A-Za-z0-9\-]{20,}",
    r"AKIA[0-9A-Z]{16}",
    r"(?i:(api[_-]?key|secret|token|password)\s*[:=]\s*['\"]?[^'\"\s]{8,})",
]
SECRET_RE = re.compile("|".join(f"(?:{pattern})" for pattern in SECRET_PATTERNS))
TOKEN_RE = re.compile(r"[A-Za-z_][A-Za-z_0-9]{1,}|[./\\-]{2,}|[{}()\[\]:=]+")


def safe_json_dumps(obj, max_chars=None):
    """Pretty-print ``obj`` as JSON, falling back to ``str(obj)`` on failure.

    When ``max_chars`` is given and the text is longer, the text is cut at
    ``max_chars`` and a truncation marker is appended.
    """
    try:
        text = json.dumps(obj, ensure_ascii=False, indent=2, default=str)
    except Exception:
        # Display helper: never let a formatting problem abort the notebook.
        text = str(obj)
    if max_chars is not None and len(text) > max_chars:
        return text[:max_chars] + "\n... [truncated]"
    return text


def is_missing_scalar(value):
    """Return True when ``value`` is None or a scalar that ``pd.isna`` flags.

    Containers (list, dict, tuple, set, and numpy arrays) are never treated
    as missing, even when they hold only missing values.
    """
    if value is None:
        return True
    if isinstance(value, (list, dict, tuple, set, frozenset)):
        return False
    if np is not None and isinstance(value, np.ndarray):
        # Arrays are containers; without this guard a one-element [nan]
        # array would be reported as a missing scalar.
        return False
    try:
        return bool(pd.isna(value))
    except Exception:
        return False


def clean_for_json(value):
    """Recursively convert ``value`` into plain JSON-friendly Python objects.

    Missing scalars become None, dict keys become strings, sequences and
    sets become lists, and numpy scalars/arrays become native Python values.
    Anything else is returned unchanged.
    """
    if is_missing_scalar(value):
        return None
    if isinstance(value, dict):
        return {str(k): clean_for_json(v) for k, v in value.items()}
    if isinstance(value, (list, tuple, set, frozenset)):
        return [clean_for_json(v) for v in value]
    if np is not None:
        if isinstance(value, np.bool_):
            return bool(value)
        if isinstance(value, np.integer):
            return int(value)
        if isinstance(value, np.floating):
            if math.isnan(float(value)):
                return None
            return float(value)
        if isinstance(value, np.ndarray):
            # Recurse so NaN entries inside the array also become None
            # instead of leaking into the output as invalid JSON ``NaN``.
            return clean_for_json(value.tolist())
    return value


def redact_possible_secrets(text):
    """Return ``text`` as a string with SECRET_RE matches replaced by a marker."""
    if text is None:
        return ""
    text = str(text)
    return SECRET_RE.sub("[REDACTED_POSSIBLE_SECRET]", text)


def contains_possible_secret(text):
    """Return True when SECRET_RE matches anywhere in ``text``."""
    if text is None:
        return False
    return bool(SECRET_RE.search(str(text)))


def preview_text(text, max_chars=MAX_PREVIEW_CHARS):
    """Return a redacted, whitespace-collapsed preview of at most ``max_chars``.

    Longer text is cut at ``max_chars`` and a truncation marker is appended.
    """
    text = redact_possible_secrets(text)
    text = re.sub(r"\s+", " ", text).strip()
    if len(text) > max_chars:
        return text[:max_chars] + " ... [truncated]"
    return text
We begin by setting up the Colab environment with only the lightweight packages needed for this workflow. We define the dataset path, output directory, random seed, preview limits, and export options so the tutorial behaves consistently. We also create the first set of helper functions for safe JSON formatting, secret redaction, missing-value handling, and clean text previews.
Building Parsing Utilities for Tool Calls and Text Outputs
Code listing:
# Keys checked, in order, for a tool name stored directly on a payload dict.
_TOOL_NAME_KEYS = (
    "name",
    "tool_name",
    "tool",
    "function",
    "command_name",
    "recipient_name",
    "toolName",
    "callee",
)

# Keys checked, in order, for a nested dict that may describe the tool call.
_NESTED_CALL_KEYS = (
    "tool_call",
    "toolCall",
    "function_call",
    "call",
    "action",
)

# Keys checked, in order, for tool arguments stored directly on a payload dict.
_TOOL_ARG_KEYS = (
    "input",
    "args",
    "arguments",
    "parameters",
    "kwargs",
    "json",
    "payload",
)

# Generic "type" values that are not treated as tool names.
_NON_TOOL_TYPES = {"tool_use", "text", "message"}


def maybe_parse_json_string(value):
    """Parse ``value`` as JSON when it is a string that looks like an object or array.

    Non-strings, strings without matching ``{}``/``[]`` delimiters, and
    strings that fail to parse are returned unchanged.
    """
    if not isinstance(value, str):
        return value
    stripped = value.strip()
    looks_like_object = stripped.startswith("{") and stripped.endswith("}")
    looks_like_array = stripped.startswith("[") and stripped.endswith("]")
    if not (looks_like_object or looks_like_array):
        return value
    try:
        return json.loads(stripped)
    except (ValueError, RecursionError):
        # json.JSONDecodeError is a ValueError; keep the raw string on failure.
        return value


def normalize_output_obj(value):
    """Return ``value`` with JSON-looking strings decoded into Python objects."""
    return maybe_parse_json_string(value)


def extract_tool_name(output):
    """Return the tool name found in ``output``, or ``""`` when none is found.

    Lookup order: direct name keys holding a non-blank string, then nested
    call dicts (searched recursively), then the ``type`` field unless it is
    one of the generic values in ``_NON_TOOL_TYPES``.
    """
    output = normalize_output_obj(output)
    if not isinstance(output, dict):
        return ""

    for key in _TOOL_NAME_KEYS:
        value = output.get(key)
        if isinstance(value, str) and value.strip():
            return value.strip()

    for nested_key in _NESTED_CALL_KEYS:
        nested = output.get(nested_key)
        if isinstance(nested, dict):
            found = extract_tool_name(nested)
            if found:
                return found

    output_type = output.get("type")
    if isinstance(output_type, str):
        output_type = output_type.strip()
        if output_type and output_type.lower() not in _NON_TOOL_TYPES:
            return output_type
    return ""


def extract_tool_args(output):
    """Return the tool arguments found in ``output``.

    Lookup order: the first direct argument key present (its value is
    returned as-is), then non-empty arguments from nested call dicts, then
    every remaining field that is not a name key or ``type``. Non-dict
    input yields ``{}``.
    """
    output = normalize_output_obj(output)
    if not isinstance(output, dict):
        return {}

    for key in _TOOL_ARG_KEYS:
        if key in output:
            return output[key]

    for nested_key in _NESTED_CALL_KEYS:
        nested = output.get(nested_key)
        if isinstance(nested, dict):
            args = extract_tool_args(nested)
            if args not in (None, "", {}):
                return args

    ignored = {*_TOOL_NAME_KEYS, "type"}
    return {key: value for key, value in output.items() if key not in ignored}


def extract_text_payload(output):
    """Return a text representation of ``output``.

    Strings are returned unchanged. For dicts, the first text-like key with
    a usable value wins: a string is returned directly, a list is dumped as
    JSON, and a dict is searched recursively. A dict with no usable key is
    dumped as JSON; any other value goes through ``str``.
    """
    output = normalize_output_obj(output)
    if isinstance(output, str):
        return output
    if isinstance(output, dict):
        text_keys = ("text", "content", "message", "output", "value", "result")
        for key in text_keys:
            value = output.get(key)
            if isinstance(value, str):
                return value
            if isinstance(value, list):
                return safe_json_dumps(value)
            if isinstance(value, dict):
                nested = extract_text_payload(value)
                if nested:
                    return nested
        return safe_json_dumps(output)
    return str(output)


def robust_len(value):
    """Return the length of ``str(value)``, or 0 when ``value`` is None."""
    if value is None:
        return 0
    return len(str(value))


def source_root(source_file):
    """Derive a coarse grouping label from a source file path.

    Backslashes are treated as path separators. If a known marker directory
    is present and followed by another path component, that component is
    returned. Otherwise the parent directory name is used, then the single
    remaining component, then ``"unknown"`` for empty input.
    """
    source_file = str(source_file or "").replace("\\", "/")
    if not source_file:
        return "unknown"

    parts = [part for part in source_file.split("/") if part]
    for marker in ("projects", "AIArchives", "archives", "claude"):
        if marker in parts:
            idx = parts.index(marker)
            # Only usable when the marker is not the last path component.
            if idx + 1 < len(parts):
                return parts[idx + 1]

    if len(parts) >= 2:
        return parts[-2]
    if parts:
        return parts[0]
    return "unknown"


def write_jsonl(path, records):
    """Write ``records`` to ``path`` as UTF-8 JSON Lines, one cleaned record per line."""
    path = Path(path)
    with path.open("w", encoding="utf-8") as file:
        for record in records:
            file.write(
                json.dumps(clean_for_json(record), ensure_ascii=False, default=str)
                + "\n"
            )


def save_plot(path):
    """Save the current matplotlib figure to ``path``, show it, close it, and return the path."""
    path = Path(path)
    plt.tight_layout()
    plt.savefig(path, dpi=160, bbox_inches="tight")
    plt.show()
    plt.close()
    return path


def print_basic_table(title, rows, columns=("Metric", "Value")):
    """Print ``rows`` as a rich table titled ``title``; every cell is stringified."""
    table = Table(title=title)
    for column in columns:
        table.add_column(str(column))
    for row in rows:
        table.add_row(*[str(item) for item in row])
    rprint(table)


def tokenize(text, max_chars=12000):
    """Lower-case the first ``max_chars`` characters of ``text`` and return TOKEN_RE matches."""
    text = str(text or "")[:max_chars].lower()
    return TOKEN_RE.findall(text)


def load_jsonl_manual(path, max_rows=None):
    """Read a JSON Lines file without any dataset-loading library.

    Blank lines are skipped. Lines that fail to parse are collected rather
    than raised. Reading stops once ``max_rows`` records have been parsed.

    Returns:
        A ``(records, bad_lines)`` tuple. Each ``bad_lines`` entry is a dict
        with the 1-based ``line_number``, the ``error`` repr, and a
        ``preview`` of the first 500 characters of the line.
    """
    records = []
    bad_lines = []
    with open(path, "r", encoding="utf-8") as file:
        for line_number, line in tqdm(enumerate(file, start=1), desc="Reading JSONL"):
            line = line.strip()
            if not line:
                continue
            try:
                records.append(json.loads(line))
            except (ValueError, RecursionError) as error:
                bad_lines.append(
                    {
                        "line_number": line_number,
                        "error": repr(error),
                        "preview": line[:500],
                    }
                )
            if max_rows is not None and len(records) >= max_rows:
                break
    return records, bad_lines
We build the core parsing utilities that turn raw output fields into usable tool names, tool arguments, and text payloads. We also define helpers for measuring text length, identifying source roots, writing JSONL files, saving plots, and printing clean tables. We finish this snippet by adding tokenization and manual JSONL loading to avoid fragile dataset-loading dependencies.
Inspecting the Hugging Face Repository and Loading JSONL Traces
Code listing:
rprint(Panel.fit("[bold]Inspecting Hugging Face dataset repository[/bold]")) api = HfApi() files = api.list_repo_files(repo_id=DATASET_ID, repo_type="dataset") pi_trace_files = [ file for file in files if file.startswith("pi-traces/") and file.endswith(".jsonl") ] file_summary = { "total_repo_files": len(files), "jsonl_files": sum(file.endswith(".jsonl") for file in files), "pi_trace_files": len(pi_trace_files), "claude_files": sum(file.startswith("claude/") for file in files), "has_flat_jsonl": FLAT_JSONL_FILENAME in files, } print_basic_table( "Repository File Summary", [(key, value) for key, value in file_summary.items()], ) rprint("[bold]Sample repository files:[/bold]") for file in files[:20]: print(" -", file) rprint(Panel.fit("[bold]Manual raw pi-trace preview[/bold]")) pi_examples = [] if pi_trace_files: for trace_file in pi_trace_files[:N_AGENT_TRACE_PREVIEWS]: try: local_trace_path = hf_hub_download( repo_id=DATASET_ID, repo_type="dataset", filename=trace_file, ) trace_records, trace_bad_lines = load_jsonl_manual(local_trace_path, max_rows=1) if trace_records: example = trace_records[0] pi_examples.append(example) preview_payload = { "trace_file": trace_file, "keys": list(example.keys()), "preview": example, } rprint( Panel( safe_json_dumps(preview_payload, max_chars=3000), title=f"Raw pi-trace preview: {trace_file}", ) ) if trace_bad_lines: rprint( f"[yellow]Bad JSONL lines in {trace_file}: {len(trace_bad_lines)}[/yellow]" ) except Exception as error: rprint(f"[yellow]Could not preview {trace_file}[/yellow]") rprint(repr(error)) else: rprint("[yellow]No pi-traces JSONL files found.[/yellow]") rprint(Panel.fit("[bold]Downloading flat merged JSONL from Hugging Face Hub[/bold]")) flat_path = hf_hub_download( repo_id=DATASET_ID, repo_type="dataset", filename=FLAT_JSONL_FILENAME, ) rprint(f"[green]Downloaded flat file:[/green] {flat_path}") rprint(Panel.fit("[bold]Loading flat JSONL manually[/bold]")) records, bad_lines = load_jsonl_manual(flat_path, 
max_rows=MAX_ROWS_TO_LOAD) if bad_lines: bad_lines_path = OUT_DIR / "bad_jsonl_lines.json" with open(bad_lines_path, "w", encoding="utf-8") as file: json.dump(bad_lines, file, ensure_ascii=False, indent=2) rprint(f"[yellow]Bad JSONL lines found: {len(bad_lines)} -> {bad_lines_path}[/yellow]") df = pd.DataFrame.from_records(records) rprint(f"[green]Loaded rows:[/green] {len(df):,}") rprint(f"[green]DataFrame shape:[/green] {df.shape}") rprint("[bold]Columns:[/bold]") print(list(df.columns)) display(df.head(3)) expected_cols = [ "uid", "source_file", "session", "model", "context", "cot", "output_type", "output", "completion", "origin", ] for column in expected_cols: if column not in df.columns: df[column] = None df["output_norm"] = df["output"].map(normalize_output_obj) df["tool_name"] = df["output_norm"].map(extract_tool_name) df["tool_args"] = df["output_norm"].map(extract_tool_args) df["text_p
[truncated for AI cost control]