Files
axolotl/docs/scripts/generate_config_docs.py
Dan Saunders 79ddaebe9a Add ruff, remove black, isort, flake8, pylint (#3092)
* black, isort, flake8 -> ruff

* remove unused

* add back needed import

* fix
2025-08-23 23:37:33 -04:00

750 lines
29 KiB
Python

# type: ignore
"""
Quarto documentation generation from Pydantic models. Uses Pydantic model source code
to automatically group fields, including inherited fields from parent classes.
"""
import ast
import inspect
import textwrap
import types
import typing
from typing import Any, FrozenSet, Type, Union
from pydantic import BaseModel
from axolotl.utils.schemas.config import AxolotlInputConfig
class QuartoGenerator:
"""Generate Quarto documentation from Pydantic models."""
def __init__(self):
self._class_fields_cache = {}
self._inheritance_map_cache = {}
self._nested_models_cache = {}
def _get_direct_fields(self, cls: Type[BaseModel]) -> FrozenSet[str]:
"""Get fields defined directly in a single class (not inherited)."""
if cls in self._class_fields_cache:
return self._class_fields_cache[cls]
fields = set()
# Get annotated fields
if hasattr(cls, "__annotations__"):
fields.update(cls.__annotations__.keys())
# Filter out private/special methods
fields = {f for f in fields if not f.startswith("_")}
result = frozenset(fields)
self._class_fields_cache[cls] = result
return result
def _is_pydantic_model(self, type_obj) -> bool:
"""Check if a type is a Pydantic BaseModel."""
return inspect.isclass(type_obj) and issubclass(type_obj, BaseModel)
def _extract_nested_type(self, field_type) -> Any:
"""Extract the actual type from complex type annotations."""
# Handle Annotated types (Python 3.9+)
if hasattr(typing, "get_origin") and hasattr(typing, "get_args"):
origin = typing.get_origin(field_type)
args = typing.get_args(field_type)
if origin is not None:
# Handle Annotated[SomeType, ...] - extract the first argument
if hasattr(typing, "Annotated") and origin is typing.Annotated:
if args:
return self._extract_nested_type(
args[0]
) # Recursively process the actual type
# Handle list[SomeType], List[SomeType], etc.
elif origin in (list, typing.List):
if args:
return self._extract_nested_type(
args[0]
) # Extract element type
# Handle Union types (including | syntax)
elif origin is typing.Union:
# Get non-None types from the Union
non_none_types = [arg for arg in args if arg is not type(None)]
if len(non_none_types) >= 1:
# Prioritize Pydantic models over primitive types
pydantic_models = [
arg
for arg in non_none_types
if self._is_pydantic_model(arg)
]
if pydantic_models:
# Return the first Pydantic model found
return self._extract_nested_type(pydantic_models[0])
# No Pydantic models, return the first non-None type
return self._extract_nested_type(non_none_types[0])
# Handle new Python 3.10+ union syntax (PeftConfig | None)
if hasattr(field_type, "__class__") and field_type.__class__ is types.UnionType:
# Get non-None types from the Union
non_none_types = [
arg for arg in field_type.__args__ if arg is not type(None)
]
if len(non_none_types) >= 1:
# Prioritize Pydantic models over primitive types
pydantic_models = [
arg for arg in non_none_types if self._is_pydantic_model(arg)
]
if pydantic_models:
return self._extract_nested_type(pydantic_models[0])
return self._extract_nested_type(non_none_types[0])
# Handle old typing.Union syntax (fallback)
if hasattr(field_type, "__origin__"):
if field_type.__origin__ is Union:
# Get non-None types from the Union
non_none_types = [
arg for arg in field_type.__args__ if arg is not type(None)
]
if len(non_none_types) >= 1:
# Prioritize Pydantic models over primitive types
pydantic_models = [
arg for arg in non_none_types if self._is_pydantic_model(arg)
]
if pydantic_models:
return self._extract_nested_type(pydantic_models[0])
return self._extract_nested_type(non_none_types[0])
# Handle other generic types like dict[str, Any], etc.
elif hasattr(field_type, "__args__"):
return field_type
return field_type
def _extract_all_pydantic_models_from_type(
self, field_type
) -> list[type[BaseModel]]:
"""Extract all Pydantic models from a type annotation, including from Unions."""
models = []
if field_type is None:
return models
# Handle Annotated types
if hasattr(typing, "get_origin") and hasattr(typing, "get_args"):
origin = typing.get_origin(field_type)
args = typing.get_args(field_type)
if origin is not None:
# Handle Annotated[SomeType, ...] - extract from the first argument
if hasattr(typing, "Annotated") and origin is typing.Annotated:
if args:
models.extend(
self._extract_all_pydantic_models_from_type(args[0])
)
return models
# Handle list[SomeType], List[SomeType], etc.
if origin in (list, typing.List):
if args:
models.extend(
self._extract_all_pydantic_models_from_type(args[0])
)
return models
# Handle Union types
if origin is typing.Union:
for arg in args:
if arg is not type(None): # Skip None type
models.extend(
self._extract_all_pydantic_models_from_type(arg)
)
return models
# Handle new Python 3.10+ union syntax
if hasattr(field_type, "__class__") and field_type.__class__ is types.UnionType:
for arg in field_type.__args__:
if arg is not type(None): # Skip None type
models.extend(self._extract_all_pydantic_models_from_type(arg))
return models
# Handle old typing.Union syntax (fallback)
if hasattr(field_type, "__origin__") and field_type.__origin__ is Union:
for arg in field_type.__args__:
if arg is not type(None): # Skip None type
models.extend(self._extract_all_pydantic_models_from_type(arg))
return models
# Check if this type itself is a Pydantic model
if self._is_pydantic_model(field_type):
models.append(field_type)
return models
def _get_nested_models(
self, model_class: type[BaseModel], visited=None
) -> dict[str, type[BaseModel]]:
"""Get all nested Pydantic models from a model class."""
if visited is None:
visited = set()
# Avoid infinite recursion
if model_class in visited:
return {}
if model_class in self._nested_models_cache:
return self._nested_models_cache[model_class]
visited.add(model_class)
nested_models = {}
# Check all fields in the model
for field_info in model_class.model_fields.values():
field_type = self._extract_nested_type(field_info.annotation)
if self._is_pydantic_model(field_type):
nested_models[field_type.__name__] = field_type
# Recursively get nested models from this nested model
deeper_nested = self._get_nested_models(field_type, visited.copy())
nested_models.update(deeper_nested)
self._nested_models_cache[model_class] = nested_models
return nested_models
def _build_inheritance_map(self, child_class: Type[BaseModel]):
"""Build inheritance map for a class and all its parents."""
if child_class in self._inheritance_map_cache:
return self._inheritance_map_cache[child_class]
inheritance_map = {}
# Get MRO and filter out BaseModel and object
mro_classes = [
cls
for cls in child_class.__mro__
if cls not in (BaseModel, object) and hasattr(cls, "__annotations__")
]
# Process each class in the MRO
for cls in mro_classes:
inheritance_map[cls] = self._get_direct_fields(cls)
self._inheritance_map_cache[child_class] = inheritance_map
return inheritance_map
def _wrap_comment(self, text: str, width: int = 88) -> list[str]:
"""Wrap a comment to specified width, accounting for '# ' prefix."""
if not text.strip():
return ["#"]
# Account for "# " prefix (2 characters)
content_width = width - 2
wrapped_lines = textwrap.wrap(text, width=content_width)
return [f"# {line}" for line in wrapped_lines]
def _extract_type_from_source(
self, model_class: type[BaseModel], field_name: str
) -> str:
"""Extract the actual type annotation text from source code, checking inheritance chain."""
# Use inheritance map to check classes efficiently
inheritance_map = self._build_inheritance_map(model_class)
# Check classes in MRO order
for cls in model_class.__mro__:
if cls in inheritance_map and field_name in inheritance_map[cls]:
type_annotation = self._get_type_from_class_source(cls, field_name)
if type_annotation != "unknown":
return type_annotation
return "unknown"
def _get_type_from_class_source(self, class_obj: type, field_name: str) -> str:
"""Extract type annotation from a specific class's source code."""
try:
source = inspect.getsource(class_obj)
tree = ast.parse(source)
except (OSError, TypeError):
return "unknown"
# Find the class definition
for node in tree.body:
if isinstance(node, ast.ClassDef) and node.name == class_obj.__name__:
# Find the field assignment
for body_node in node.body:
if isinstance(body_node, ast.AnnAssign) and isinstance(
body_node.target, ast.Name
):
if body_node.target.id == field_name and body_node.annotation:
return ast.unparse(body_node.annotation)
break
return "unknown"
def _extract_field_groups_from_all_classes(
self, model_class: type[BaseModel]
) -> list[dict]:
"""Extract field groups from all classes in the inheritance hierarchy."""
all_groups = []
inheritance_map = self._build_inheritance_map(model_class)
# Get all Pydantic base classes in MRO order (most specific first)
# This puts AxolotlInputConfig fields first, then parent class fields
pydantic_classes = [
cls
for cls in model_class.__mro__
if cls in inheritance_map and inheritance_map[cls]
]
# Extract groups from each class
for cls in pydantic_classes:
class_groups = self._extract_field_groups_from_source(cls)
for group in class_groups:
all_groups.append(group)
# If no groups found, create a default grouping by class
if not all_groups:
for cls in pydantic_classes:
fields_in_class = inheritance_map[cls]
if fields_in_class:
all_groups.append(
{
"fields": list(fields_in_class),
}
)
return all_groups
def _extract_field_groups_from_source(
self, model_class: type[BaseModel]
) -> list[dict]:
"""Extract field groups from source code based on blank lines and comments."""
try:
source = inspect.getsource(model_class)
tree = ast.parse(source)
except (OSError, TypeError):
# Fallback if we can't get source code
fields_in_class = self._get_direct_fields(model_class)
if fields_in_class:
return [
{
"fields": list(fields_in_class),
}
]
return []
groups = []
current_group_fields = []
current_group_comment = None
# Find the class definition
class_node = None
for node in ast.walk(tree):
if isinstance(node, ast.ClassDef) and node.name == model_class.__name__:
class_node = node
break
if not class_node:
fields_in_class = self._get_direct_fields(model_class)
if fields_in_class:
return [
{
"fields": list(fields_in_class),
}
]
return []
# Parse the source lines to detect groupings
source_lines = source.split("\n")
# Get fields that are actually defined in this specific class
fields_in_class = self._get_direct_fields(model_class)
# Find assignments that correspond to model fields for THIS class only
field_assignments = []
for node in class_node.body:
if isinstance(node, ast.AnnAssign) and isinstance(node.target, ast.Name):
field_name = node.target.id
if field_name in fields_in_class:
field_assignments.append(
{
"name": field_name,
"lineno": node.lineno,
"end_lineno": getattr(node, "end_lineno", node.lineno),
}
)
if not field_assignments:
if fields_in_class:
return [
{
"fields": list(fields_in_class),
}
]
return []
# Sort by line number
field_assignments.sort(key=lambda x: x["lineno"])
# Group fields based on blank lines and comments
for i, field_info in enumerate(field_assignments):
field_name = field_info["name"]
current_line = field_info["lineno"]
# Check if this starts a new group (blank line before or significant gap)
is_new_group = False
if i == 0:
is_new_group = True
else:
prev_end_line = field_assignments[i - 1]["end_lineno"]
# Check for blank lines or comments between fields
lines_between = source_lines[prev_end_line : current_line - 1]
has_blank_line = any(line.strip() == "" for line in lines_between)
has_comment = any(
line.strip().startswith("#") for line in lines_between
)
# Start new group if there's a blank line or comment, or significant gap
if has_blank_line or has_comment or (current_line - prev_end_line > 3):
is_new_group = True
if is_new_group and current_group_fields:
# Save the previous group
groups.append(
{
"fields": current_group_fields.copy(),
"description": current_group_comment,
}
)
current_group_fields = []
current_group_comment = None
current_group_fields.append(field_name)
# Add the final group
if current_group_fields:
groups.append(
{
"fields": current_group_fields,
"description": current_group_comment,
}
)
return groups
def _generate_field_documentation(
self,
model_class: type[BaseModel],
field_name: str,
field_info: dict,
field_type_str: str,
is_required: bool,
indent_level: int = 0,
visited_models: set = None,
) -> list[str]:
"""Generate documentation for a single field, expanding nested models inline."""
if visited_models is None:
visited_models = set()
lines = []
indent = " " * indent_level
# Get the actual field type for nested model detection
if field_name in model_class.model_fields:
pydantic_field_info = model_class.model_fields[field_name]
actual_field_type = pydantic_field_info.annotation
else:
actual_field_type = None
# Add description comment if available
description = field_info.get("description", "")
if description:
wrapped_lines = self._wrap_comment(description, width=88 - len(indent))
for line in wrapped_lines:
lines.append(f"{indent}{line}")
# Extract nested Pydantic models from the type annotation
nested_models = self._extract_all_pydantic_models_from_type(actual_field_type)
# Filter out already visited models to prevent infinite recursion
expandable_models = [
model for model in nested_models if model not in visited_models
]
if expandable_models:
# This field contains Pydantic models that can be expanded
# Show the field with its full type annotation
field_line = f"{indent}{field_name}: {field_type_str}"
if field_info.get("default") is not None:
field_line += f" = {field_info['default']}"
if is_required:
field_line += " (required)"
lines.append(field_line)
# Add to visited to prevent infinite recursion
new_visited = visited_models.copy()
new_visited.update(expandable_models)
# Expand each nested Pydantic model
for i, nested_model in enumerate(expandable_models):
if i > 0:
lines.append("\n")
lines.append(f"{indent} # For {nested_model.__name__}:")
# Get nested model schema
try:
nested_schema = nested_model.model_json_schema()
nested_properties = nested_schema.get("properties", {})
nested_required = nested_schema.get("required", [])
except Exception:
# Fallback: use model fields directly
nested_properties = {}
nested_required = []
for (
nested_field_name,
nested_field_info,
) in nested_model.model_fields.items():
nested_description = ""
if (
hasattr(nested_field_info, "json_schema_extra")
and nested_field_info.json_schema_extra
):
nested_description = (
nested_field_info.json_schema_extra.get(
"description", ""
)
)
elif (
hasattr(nested_field_info, "description")
and nested_field_info.description
):
nested_description = nested_field_info.description
nested_default_val = None
if (
hasattr(nested_field_info, "default")
and nested_field_info.default is not None
):
if str(nested_field_info.default) != "PydanticUndefined":
nested_default_val = nested_field_info.default
nested_properties[nested_field_name] = {
"type": "unknown",
"description": nested_description,
"default": nested_default_val,
}
if nested_field_info.is_required():
nested_required.append(nested_field_name)
# Get field groups for the nested model
nested_field_groups = self._extract_field_groups_from_all_classes(
nested_model
)
# Generate nested fields with increased indentation
for i, group in enumerate(nested_field_groups):
if not group["fields"]:
continue
# Add blank line between groups (except before first group)
if i > 0:
lines.append("")
# Process nested fields
for nested_field_name in group["fields"]:
if nested_field_name not in nested_properties:
continue
nested_field_info = nested_properties[nested_field_name]
nested_field_type = self._extract_type_from_source(
nested_model, nested_field_name
)
nested_is_required = nested_field_name in nested_required
# Recursively generate documentation for nested field
nested_lines = self._generate_field_documentation(
nested_model,
nested_field_name,
nested_field_info,
nested_field_type,
nested_is_required,
indent_level + 1,
new_visited,
)
lines.extend(nested_lines)
else:
# Regular field (no expandable nested models)
field_line = f"{indent}{field_name}: {field_type_str}"
if field_info.get("default") is not None:
field_line += f" = {field_info['default']}"
if is_required:
field_line += " (required)"
lines.append(field_line)
return lines
def generate_qmd(
self,
model_class: type[BaseModel],
title: str | None = None,
expand_nested: bool = True,
) -> str:
"""Auto-generate config reference documentation including inherited fields."""
if title is None:
title = f"{model_class.__name__} Reference"
# Try to get JSON schema, with fallback for serialization issues
try:
schema = model_class.model_json_schema()
properties = schema.get("properties", {})
required = schema.get("required", [])
except Exception as e:
print(
f"Warning: Could not generate JSON schema ({e}). Using model fields instead."
)
# Fallback: use model fields directly
properties = {}
required = []
for field_name, field_info in model_class.model_fields.items():
# Extract description from json_schema_extra or field info
description = ""
if (
hasattr(field_info, "json_schema_extra")
and field_info.json_schema_extra
):
description = field_info.json_schema_extra.get("description", "")
elif hasattr(field_info, "description") and field_info.description:
description = field_info.description
# Get default value
default_val = None
if hasattr(field_info, "default") and field_info.default is not None:
# Handle special Pydantic default markers
if str(field_info.default) != "PydanticUndefined":
default_val = field_info.default
properties[field_name] = {
"type": "unknown",
"description": description,
"default": default_val,
}
if field_info.is_required():
required.append(field_name)
# Extract field groups from all classes in inheritance hierarchy
field_groups = self._extract_field_groups_from_all_classes(model_class)
# Start building QMD content
qmd_lines = [
"---",
f"title: {title}",
"description: A complete list of all configuration options.",
"---",
"",
]
# Generate one big code block with all fields (inline nested expansion)
qmd_lines.append("```yaml")
for i, group in enumerate(field_groups):
if not group["fields"]:
continue
# Add blank line between groups (except before first group)
if i > 0:
qmd_lines.append("")
# Process fields in the order they appear in source
for field_name in group["fields"]:
if field_name not in properties:
continue
field_info = properties[field_name]
field_type = self._extract_type_from_source(model_class, field_name)
is_required = field_name in required
if expand_nested:
# Check if this field has nested models
if field_name in model_class.model_fields:
pydantic_field_info = model_class.model_fields[field_name]
nested_models = self._extract_all_pydantic_models_from_type(
pydantic_field_info.annotation
)
has_nested = bool(nested_models)
else:
has_nested = False
# Add blank line before nested config
if has_nested:
qmd_lines.append("")
# Use the new inline generation method
field_lines = self._generate_field_documentation(
model_class,
field_name,
field_info,
field_type,
is_required,
indent_level=0,
visited_models=set(),
)
qmd_lines.extend(field_lines)
# Add blank line after nested config
if has_nested:
qmd_lines.append("")
else:
# Original simple approach
description = field_info.get("description", "")
default = field_info.get("default")
# Add wrapped comment for description
if description:
wrapped_lines = self._wrap_comment(description)
qmd_lines.extend(wrapped_lines)
line = f"{field_name}: {field_type}"
if default is not None:
line += f" = {default}"
if is_required:
line += " (required)"
qmd_lines.append(line)
qmd_lines.append("```")
# Join all lines and clean up any double newlines
content = "\n".join(qmd_lines)
# Replace multiple consecutive newlines with just two newlines (one blank line)
import re
content = re.sub(r"\n{3,}", "\n\n", content)
# Ensure single newline at the very end
content = content.rstrip("\n") + "\n"
return content
def main():
generator = QuartoGenerator()
print("Generating config reference content...")
qmd_content = generator.generate_qmd(AxolotlInputConfig, "Config Reference", True)
print("Writing to file...")
with open("docs/config-reference.qmd", "w", encoding="utf-8") as f:
f.write(qmd_content)
print("Done!")
if __name__ == "__main__":
main()