Skip to content

Some quality-of-life enhancements to toc.py CLI script #329

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 5 commits into
base: main
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
216 changes: 190 additions & 26 deletions tutorial/toc.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,72 +2,178 @@
"""CLI script to build a table of contents for an IPython notebook"""

import argparse as ap
import logging
import pathlib
import re
from collections import namedtuple
import sys
from typing import NamedTuple

import nbformat
from nbformat import NotebookNode

TocEntry = namedtuple("TocEntry", ["level", "text", "anchor"])
__version__ = "0.1.2"

# Set up logging
logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")
logger = logging.getLogger("toc")


class TocEntry(NamedTuple):
"""Table of contents entry"""

level: int
text: str
anchor: str


def extract_markdown_cells(notebook: NotebookNode) -> str:
"""Extract the markdown cells from a notebook"""
"""Extract the markdown cells from a notebook

Args:
notebook: A notebook object

Returns:
str: Concatenated content of all markdown cells
"""
return "\n".join(
[cell.source for cell in notebook.cells if cell.cell_type == "markdown"]
)


def extract_toc(notebook: str) -> list[TocEntry]:
"""Extract the table of contents from a markdown string"""
def extract_toc(notebook: str, toc_header: str) -> list[TocEntry]:
"""Extract the table of contents from a markdown string

Parses markdown headings (lines starting with #) and converts them to TOC entries.
Each entry includes the heading level, text, and an anchor derived from the text.
Ignores '#' symbols inside code blocks.

Args:
notebook: String containing markdown content
toc_header: Header text for the table of contents

Returns:
list[TocEntry]: List of table of contents entries
"""
toc = []
line_re = re.compile(r"(#+)\s+(.+)")
for line in notebook.splitlines():
is_code_block = False

for line_num, line in enumerate(notebook.splitlines(), start=1):
# Skip line if contains exactly the toc header
if line.strip() == toc_header:
continue

# Check if we're entering or exiting a code block
if line.strip().startswith("```"):
is_code_block = not is_code_block
continue

# Skip header processing if we're in a code block
if is_code_block:
continue

# Process headers
if groups := re.match(line_re, line):
heading, text, *_ = groups.groups()
level = len(heading)
anchor = "-".join(text.replace("`", "").split())
toc.append(TocEntry(level, text, anchor))
try:
heading, text, *_ = groups.groups()
level = len(heading)

# Clean the text to make a proper anchor
clean_text = text.replace("`", "")
# Remove any other special characters that might break anchors
clean_text = re.sub(r"[^\w\s-]", "", clean_text)
anchor = "-".join(clean_text.lower().split())

toc.append(TocEntry(level, text, anchor))
logger.debug("Found heading (level %d): %s", level, text)

except Exception as e:
logger.warning("Error processing heading at line %d: %s", line_num, e)

return toc


def markdown_toc(toc: list[TocEntry]) -> str:
"""Build a string representation of the toc as a nested markdown list"""
"""Build a string representation of the toc as a nested markdown list

Args:
toc: List of TocEntry objects

Returns:
str: Markdown-formatted table of contents with proper indentation
"""
lines = []
for entry in toc:
line = f"{' ' * entry.level}- [{entry.text}](#{entry.anchor})"
lines.append(line)
return "\n".join(lines)


def build_toc(nb_path: pathlib.Path, placeholder: str = "[TOC]") -> NotebookNode:
"""Build a table of contents for a notebook and insert it at the location of a placeholder"""
def build_toc(
nb_path: pathlib.Path,
placeholder: str = "[TOC]",
toc_header: str = "# Table of Contents",
) -> tuple[NotebookNode, bool]:
"""Build a table of contents for a notebook and insert it at the location of a placeholder

Args:
nb_path: Path to the notebook file
placeholder: The text to replace with the generated TOC (default: "[TOC]")
toc_header: The header text to use for the TOC (default: "# Table of Contents")

Returns:
tuple[NotebookNode, bool]: The notebook with TOC inserted and a boolean indicating if placeholder was found
"""
# Read the notebook
nb_obj: NotebookNode = nbformat.read(nb_path, nbformat.NO_CONVERT)
try:
nb_obj: NotebookNode = nbformat.read(nb_path, nbformat.NO_CONVERT)
except Exception:
logger.exception("Failed to read notebook '%s'", nb_path)
raise

md_cells = extract_markdown_cells(nb_obj)

# Build tree
toc_tree = extract_toc(md_cells)
toc_tree = extract_toc(md_cells, toc_header)

if not toc_tree:
logger.warning("No headings found in notebook '%s'", nb_path)

# Build toc representation
toc_repr = markdown_toc(toc_tree)

# Insert it a the location of a placeholder
toc_header = "# Table of Contents"
# Insert it at the location of a placeholder
toc_replaced = False

for cell in nb_obj.cells:
if cell.source.startswith((placeholder, toc_header)):
cell.source = f"{toc_header}\n{toc_repr}"
cell.cell_type = "markdown"
toc_replaced = True
break

return nb_obj
if not toc_replaced:
logger.warning(
"Placeholder '%s' or heading '%s' not found in notebook",
placeholder,
toc_header,
)

return nb_obj, toc_replaced


def main():
"""CLI entry point"""
parser = ap.ArgumentParser(
description="Build a table of contents for an IPython notebook"
description="Build a table of contents for an IPython notebook",
epilog="""
This script extracts headings from markdown cells in a Jupyter notebook and
generates a markdown-formatted table of contents. The TOC is inserted into
the notebook at the location of a placeholder (default: '[TOC]') or where
a '# Table of Contents' heading exists. Links in the TOC point to notebook
anchors created from the heading text.
""",
formatter_class=ap.RawDescriptionHelpFormatter,
)
parser.add_argument("notebook", type=str, help="Path to the notebook to process")
parser.add_argument(
Expand All @@ -80,22 +186,80 @@ def main():
default=False,
help="Force overwrite of original notebook",
)
parser.add_argument(
"--placeholder",
"-p",
type=str,
default="[TOC]",
help="Placeholder text to replace with the TOC (default: '[TOC]')",
)
parser.add_argument(
"--header",
type=str,
default="# Table of Contents",
help="Header text for the TOC (default: '# Table of Contents')",
)
parser.add_argument(
"--verbose", "-v", action="store_true", help="Enable verbose output"
)
parser.add_argument(
"--version", action="version", version=f"%(prog)s {__version__}"
)
args = parser.parse_args()

if not (input_nb := pathlib.Path(args.notebook)).exists():
raise FileNotFoundError(input_nb)
# Set logging level based on verbosity
if args.verbose:
logger.setLevel(logging.DEBUG)

# Validate input file
try:
input_nb = pathlib.Path(args.notebook)
if not input_nb.exists():
logger.error("Input file not found: %s", input_nb)
sys.exit(1)
if not input_nb.is_file():
logger.error("Input path is not a file: %s", input_nb)
sys.exit(1)
except Exception:
logger.exception("Error processing input path")
sys.exit(1)

# Set output file path
if args.output is None:
output_nb = input_nb.with_suffix(".toc.ipynb")
else:
output_nb = pathlib.Path(args.output)

with output_nb.open("w", encoding="utf-8") as file:
nbformat.write(build_toc(input_nb), file)
# Create output directory if it doesn't exist
output_nb.parent.mkdir(parents=True, exist_ok=True)

try:
# Generate TOC and write to output file
logger.info("Processing notebook: %s", input_nb)
toc_notebook, toc_replaced = build_toc(input_nb, args.placeholder, args.header)

if not toc_replaced:
logger.warning("Skipping output - no placeholder found in notebook")
sys.exit(0) # Exit with success code since it's not an error

if not args.force:
logger.debug("Ignoring output file: %s", output_nb)

with output_nb.open("w", encoding="utf-8") as file:
nbformat.write(toc_notebook, file)

logger.info("TOC written to: %s", output_nb)
else:
logger.info("Replacing original notebook with TOC version")

with input_nb.open("w", encoding="utf-8") as file:
nbformat.write(toc_notebook, file)

logger.info("Original notebook replaced with: %s", input_nb)

if args.force:
input_nb.unlink()
output_nb.rename(input_nb)
except Exception:
logger.exception("Error processing notebook")
sys.exit(1)


if __name__ == "__main__":
Expand Down