Infrastructure

Language model backend, dataset, and database configuration.

LanguageModelConfig `pydantic-model`

Bases: BaseModelConfig

Fields:

device (str)
dtype (dtype)
model_name (str)
model_from_pretrained_path (str | None)
use_flash_attn (bool)
cache_dir (str | None)
local_files_only (bool)
max_length (int)
backend (Literal['huggingface', 'transformer_lens', 'auto'])
load_ckpt (bool)
tokenizer_only (bool)
prepend_bos (bool)
bos_token_id (int | None)
eos_token_id (int | None)
pad_token_id (int | None)

model_name `pydantic-field`

model_name: str = 'gpt2'

The name of the model to use.

model_from_pretrained_path `pydantic-field`

model_from_pretrained_path: str | None = None

The path to the pretrained model. If None, will use the model from HuggingFace.

use_flash_attn `pydantic-field`

use_flash_attn: bool = False

Whether to use Flash Attention.

cache_dir `pydantic-field`

cache_dir: str | None = None

The directory of the HuggingFace cache. Should have the same effect as HF_HOME.

local_files_only `pydantic-field`

local_files_only: bool = False

Whether to only load the model from the local files. Should have the same effect as HF_HUB_OFFLINE=1.

max_length `pydantic-field`

max_length: int = 2048

The maximum length of the input.

backend `pydantic-field`

backend: Literal[
    "huggingface", "transformer_lens", "auto"
] = "auto"

The backend to use for the language model.

tokenizer_only `pydantic-field`

tokenizer_only: bool = False

Whether to only load the tokenizer.

prepend_bos `pydantic-field`

prepend_bos: bool = True

Whether to prepend the BOS token to the input.

bos_token_id `pydantic-field`

bos_token_id: int | None = None

The ID of the BOS token. If None, will use the default BOS token.

eos_token_id `pydantic-field`

eos_token_id: int | None = None

The ID of the EOS token. If None, will use the default EOS token.

pad_token_id `pydantic-field`

pad_token_id: int | None = None

The ID of the padding token. If None, will use the default padding token.

from_pretrained_sae `staticmethod`

from_pretrained_sae(pretrained_name_or_path: str, **kwargs)

Load the LanguageModelConfig from a pretrained SAE name or path. Config is read from `<pretrained_name_or_path>/lm_config.json` (for local storage), `<repo_id>/<name>/lm_config.json` (for HuggingFace Hub), or constructed from model name (for SAELens).

Parameters:

Name	Type	Description	Default
`pretrained_name_or_path`	`str`	The path to the pretrained SAE.	required
`**kwargs`	`Any`	Additional keyword arguments to pass to the LanguageModelConfig constructor.	`{}`

Source code in src/lm_saes/backend/language_model.py

@staticmethod
def from_pretrained_sae(pretrained_name_or_path: str, **kwargs):
    """Load the LanguageModelConfig that accompanies a pretrained SAE.

    The SAE type is inferred from the part of ``pretrained_name_or_path`` that precedes the first ``:``:

    - Local storage: ``lm_config.json`` is read from ``os.path.dirname(pretrained_name_or_path)``.
    - HuggingFace Hub: the argument is split as ``<repo_id>:<name>`` and ``<name>/lm_config.json`` is
      downloaded from ``<repo_id>``.
    - SAELens: the argument is split as ``<repo_id>:<name>`` and the config is constructed from the model
      name registered for ``<repo_id>`` in the SAELens pretrained SAEs directory.

    Args:
        pretrained_name_or_path (str): The name or path of the pretrained SAE.
        **kwargs (Any): Additional keyword arguments. Passed to the LanguageModelConfig constructor for
            SAELens, and forwarded to ``LanguageModelConfig.model_validate`` for local/HuggingFace configs.

    Returns:
        The loaded LanguageModelConfig.

    Raises:
        ValueError: If the inferred SAE type is not supported.
        AssertionError: If a SAELens SAE cannot be found in the SAELens directory.
    """
    sae_type = auto_infer_pretrained_sae_type(pretrained_name_or_path.split(":")[0])
    if sae_type == PretrainedSAEType.LOCAL:
        path = os.path.join(os.path.dirname(pretrained_name_or_path), "lm_config.json")
    elif sae_type == PretrainedSAEType.HUGGINGFACE:
        repo_id, name = pretrained_name_or_path.split(":")
        path = hf_hub_download(repo_id=repo_id, filename=f"{name}/lm_config.json")
    elif sae_type == PretrainedSAEType.SAELENS:
        from sae_lens.loading.pretrained_saes_directory import get_pretrained_saes_directory

        repo_id, name = pretrained_name_or_path.split(":")
        lookups = get_pretrained_saes_directory()
        assert lookups.get(repo_id) is not None and lookups[repo_id].saes_map.get(name) is not None, (
            f"Pretrained SAE {pretrained_name_or_path} not found in SAELens. This might indicate bugs in `auto_infer_pretrained_sae_type`."
        )
        model_name = lookups[repo_id].model
        return LanguageModelConfig(model_name=model_name, **kwargs)
    else:
        raise ValueError(f"Unsupported pretrained type: {sae_type}")
    # `path` already points at the lm_config.json file itself in both remaining branches,
    # so it must be opened directly rather than joined with the file name a second time.
    with open(path, "r", encoding="utf-8") as f:
        lm_config = json.load(f)
    return LanguageModelConfig.model_validate(lm_config, **kwargs)

TransformerLensLanguageModel

TransformerLensLanguageModel(
    cfg: LanguageModelConfig,
    device_mesh: DeviceMesh | None = None,
)

Bases: LanguageModel

Source code in src/lm_saes/backend/language_model.py

def __init__(self, cfg: LanguageModelConfig, device_mesh: DeviceMesh | None = None):
    """Set up the tokenizer and, when weights are requested, the HookedTransformer model.

    The HuggingFace model is only loaded when ``cfg.load_ckpt`` is set and ``cfg.tokenizer_only`` is not;
    otherwise ``self.model`` is left as ``None`` and only the tokenizer is available.

    Args:
        cfg: Language model configuration.
        device_mesh: Optional device mesh, stored on the instance unchanged.
    """
    self.cfg = cfg
    self.device_mesh = device_mesh

    # A bare "cuda"/"npu" is pinned to the accelerator index that is current for this process.
    device_spec = cfg.device
    if device_spec == "cuda":
        device_spec = f"cuda:{torch.cuda.current_device()}"
    elif device_spec == "npu":
        device_spec = f"npu:{torch.npu.current_device()}"  # type: ignore[reportAttributeAccessIssue]
    self.device = torch.device(device_spec)

    # An explicit pretrained path takes precedence over the model name.
    pretrained_source = (
        cfg.model_from_pretrained_path if cfg.model_from_pretrained_path is not None else cfg.model_name
    )

    if cfg.load_ckpt and not cfg.tokenizer_only:
        hf_model = AutoModelForCausalLM.from_pretrained(
            pretrained_source,
            cache_dir=cfg.cache_dir,
            local_files_only=cfg.local_files_only,
            dtype=cfg.dtype,
            trust_remote_code=True,
        )
    else:
        hf_model = None

    hf_tokenizer = AutoTokenizer.from_pretrained(
        pretrained_source,
        cache_dir=cfg.cache_dir,
        trust_remote_code=True,
        use_fast=True,
        add_bos_token=True,
        local_files_only=cfg.local_files_only,
    )
    # Apply the BOS/EOS/PAD token id overrides from the config.
    self.tokenizer = set_tokens(
        hf_tokenizer,
        cfg.bos_token_id,
        cfg.eos_token_id,
        cfg.pad_token_id,
    )

    if hf_model and not cfg.tokenizer_only:
        self.model = HookedTransformer.from_pretrained_no_processing(
            cfg.model_name,
            use_flash_attn=cfg.use_flash_attn,
            device=self.device,
            cache_dir=cfg.cache_dir,
            hf_model=hf_model,
            hf_config=hf_model.config,
            tokenizer=hf_tokenizer,
            dtype=cfg.dtype,  # type: ignore ; issue with transformer_lens
        )
    else:
        self.model = None

run_with_cache_until

run_with_cache_until(*args, **kwargs) -> Any

Run with activation caching, stopping at a given hook for efficiency.

Source code in src/lm_saes/backend/language_model.py

def run_with_cache_until(self, *args, **kwargs) -> Any:
    """Run the model with activation caching, stopping at a given hook for efficiency.

    Without a device mesh the call is forwarded unchanged. With a device mesh, every leaf of the
    arguments is mapped through ``self._to_tensor`` before the call and every leaf of the result is
    mapped through ``self._to_dtensor`` afterwards.
    """
    assert self.model is not None, "model must be initialized"
    if self.device_mesh is not None:
        # NOTE(review): presumably converts distributed tensors to local ones and back; confirm
        # against the `_to_tensor` / `_to_dtensor` helpers.
        local_args = pytree.tree_map(self._to_tensor, args)
        local_kwargs = pytree.tree_map(self._to_tensor, kwargs)
        outputs = run_with_cache_until(self.model, *local_args, **local_kwargs)
        return pytree.tree_map(self._to_dtensor, outputs)

    return run_with_cache_until(self.model, *args, **kwargs)

HuggingFaceLanguageModel

HuggingFaceLanguageModel(cfg: LanguageModelConfig)

Bases: LanguageModel

Source code in src/lm_saes/backend/language_model.py

def __init__(self, cfg: LanguageModelConfig):
    """Load a HuggingFace causal LM and its tokenizer, move the model to the device, and set eval mode.

    Args:
        cfg: Language model configuration.
    """
    self.cfg = cfg

    # A bare "cuda"/"npu" is pinned to the accelerator index that is current for this process.
    device_spec = cfg.device
    if device_spec == "cuda":
        device_spec = f"cuda:{torch.cuda.current_device()}"
    elif device_spec == "npu":
        device_spec = f"npu:{torch.npu.current_device()}"  # type: ignore[reportAttributeAccessIssue]
    self.device = torch.device(device_spec)

    # An explicit pretrained path takes precedence over the model name.
    pretrained_source = (
        cfg.model_from_pretrained_path if cfg.model_from_pretrained_path is not None else cfg.model_name
    )

    loaded_model = AutoModelForCausalLM.from_pretrained(
        pretrained_source,
        cache_dir=cfg.cache_dir,
        local_files_only=cfg.local_files_only,
        dtype=cfg.dtype,
        trust_remote_code=True,
    )
    self.model = loaded_model.to(self.device)  # type: ignore

    self.tokenizer = AutoTokenizer.from_pretrained(
        pretrained_source,
        cache_dir=cfg.cache_dir,
        trust_remote_code=True,
        use_fast=True,
        add_bos_token=True,
        local_files_only=cfg.local_files_only,
    )
    self.model.eval()

DatasetConfig `pydantic-model`

Bases: BaseConfig

Fields:

dataset_name_or_path (str)
cache_dir (str | None)
is_dataset_on_disk (bool)

dataset_name_or_path `pydantic-field`

dataset_name_or_path: str = 'openwebtext'

The name or path to the dataset. Should be a valid dataset name or path for datasets.load_dataset or datasets.load_from_disk, depending on is_dataset_on_disk.

cache_dir `pydantic-field`

cache_dir: str | None = None

The directory to cache the dataset. Will be passed to datasets.load_dataset.

is_dataset_on_disk `pydantic-field`

is_dataset_on_disk: bool = False

Whether the dataset is saved through datasets.save_to_disk. If True, the dataset will be loaded through datasets.load_from_disk. Otherwise, it will be loaded through datasets.load_dataset.

MongoDBConfig `pydantic-model`

Bases: BaseModel

Fields:

mongo_uri (str)
mongo_db (str)

MongoClient

MongoClient(cfg: MongoDBConfig)

Source code in src/lm_saes/database.py

def __init__(self, cfg: MongoDBConfig):
    """Connect to MongoDB, bind the collections used by this client, and create their indexes.

    Args:
        cfg: MongoDB configuration providing ``mongo_uri`` and ``mongo_db``.
    """
    self.client: pymongo.MongoClient = pymongo.MongoClient(cfg.mongo_uri)
    self.db = self.client[cfg.mongo_db]
    self.fs: gridfs.GridFS | None = None

    database = self.db
    self.feature_collection = database["features"]
    self.sae_collection = database["saes"]
    self.analysis_collection = database["analyses"]
    self.dataset_collection = database["datasets"]
    self.model_collection = database["models"]
    self.bookmark_collection = database["bookmarks"]
    self.sae_set_collection = database["sae_sets"]
    self.circuit_collection = database["circuits"]

    asc, desc = pymongo.ASCENDING, pymongo.DESCENDING
    # (collection, index keys, unique?) -- created in this order.
    index_specs = [
        (self.sae_collection, [("name", asc), ("series", asc)], True),
        (self.sae_collection, [("series", asc)], False),
        (self.analysis_collection, [("name", asc), ("sae_name", asc), ("sae_series", asc)], True),
        (self.feature_collection, [("sae_name", asc), ("sae_series", asc), ("index", asc)], True),
        (self.dataset_collection, [("name", asc)], True),
        (self.model_collection, [("name", asc)], True),
        (self.sae_set_collection, [("name", asc)], True),
        (self.sae_set_collection, [("sae_series", asc)], False),
        (self.bookmark_collection, [("sae_name", asc), ("sae_series", asc), ("feature_index", asc)], True),
        (self.bookmark_collection, [("created_at", desc)], False),
        (self.circuit_collection, [("sae_series", asc)], False),
        (self.circuit_collection, [("created_at", desc)], False),
    ]
    for collection, keys, is_unique in index_specs:
        if is_unique:
            collection.create_index(keys, unique=True)
        else:
            collection.create_index(keys)

    # GridFS is initialized by default.
    self._init_fs()

enable_gridfs

enable_gridfs() -> None

Enable GridFS for storing large binary data.

Source code in src/lm_saes/database.py

def enable_gridfs(self) -> None:
    """Enable GridFS for storing large binary data.

    Does nothing when a GridFS handle is already present on ``self.fs``.
    """
    if self.fs is not None:
        return
    self._init_fs()

disable_gridfs

disable_gridfs() -> None

Disable GridFS usage.

Source code in src/lm_saes/database.py

def disable_gridfs(self) -> None:
    """Disable GridFS usage by dropping the handle stored on ``self.fs``."""
    # `is_gridfs_enabled` reports False once the handle is None.
    self.fs = None

is_gridfs_enabled

is_gridfs_enabled() -> bool

Check if GridFS is enabled.

Source code in src/lm_saes/database.py

def is_gridfs_enabled(self) -> bool:
    """Report whether a GridFS handle is currently set on ``self.fs``."""
    gridfs_handle = self.fs
    return gridfs_handle is not None

update_sae

update_sae(
    name: str, series: str, update_data: dict[str, Any]
)

Update an SAE and all its references.

If the name is updated, all references in other collections are also updated within a transaction.

Source code in src/lm_saes/database.py

def update_sae(self, name: str, series: str, update_data: dict[str, Any]):
    """Update an SAE document and, on rename, every reference to it.

    When ``update_data`` carries a non-empty ``"name"`` that differs from ``name``, the references in
    the feature, analysis, bookmark, SAE set and circuit collections are rewritten together with the
    SAE document inside a single transaction. Otherwise only the SAE document is updated.

    Args:
        name: Current name of the SAE.
        series: Series of the SAE.
        update_data: Fields to ``$set`` on the SAE document.
    """
    renamed_to = update_data.get("name")
    is_rename = renamed_to and renamed_to != name
    if not is_rename:
        self.sae_collection.update_one({"name": name, "series": series}, {"$set": update_data})
        return

    # Collections that reference the SAE through a scalar `sae_name` field.
    scalar_referrers = (self.feature_collection, self.analysis_collection, self.bookmark_collection)
    # Collections that reference the SAE as an element of an array field.
    array_referrers = (
        (self.sae_set_collection, "sae_names"),
        (self.circuit_collection, "clt_names"),
        (self.circuit_collection, "lorsa_names"),
    )

    with self.client.start_session() as session, session.start_transaction():
        for collection in scalar_referrers:
            collection.update_many(
                {"sae_name": name, "sae_series": series},
                {"$set": {"sae_name": renamed_to}},
                session=session,
            )
        for collection, array_field in array_referrers:
            # Only the array elements equal to the old name are replaced.
            collection.update_many(
                {array_field: name, "sae_series": series},
                {"$set": {f"{array_field}.$[elem]": renamed_to}},
                array_filters=[{"elem": name}],
                session=session,
            )
        # The SAE document itself is updated last.
        self.sae_collection.update_one({"name": name, "series": series}, {"$set": update_data}, session=session)

update_sae_set

update_sae_set(name: str, update_data: dict[str, Any])

Update an SAE set and all its references.

If the name is updated, all references in other collections are also updated within a transaction.

Source code in src/lm_saes/database.py

def update_sae_set(self, name: str, update_data: dict[str, Any]):
    """Update an SAE set document and, on rename, the circuits that reference it.

    When ``update_data`` carries a non-empty ``"name"`` that differs from ``name``, the
    ``sae_set_name`` of matching circuits and the SAE set document are updated inside a single
    transaction. Otherwise only the SAE set document is updated.

    Args:
        name: Current name of the SAE set.
        update_data: Fields to ``$set`` on the SAE set document.
    """
    renamed_to = update_data.get("name")
    is_rename = renamed_to and renamed_to != name
    if not is_rename:
        self.sae_set_collection.update_one({"name": name}, {"$set": update_data})
        return

    with self.client.start_session() as session, session.start_transaction():
        # Re-point the circuits first, then rename the set itself.
        self.circuit_collection.update_many(
            {"sae_set_name": name},
            {"$set": {"sae_set_name": renamed_to}},
            session=session,
        )
        self.sae_set_collection.update_one({"name": name}, {"$set": update_data}, session=session)

get_random_alive_feature

get_random_alive_feature(
    sae_name: str,
    sae_series: str,
    name: str | None = None,
    metric_filters: dict[str, dict[str, float]]
    | None = None,
) -> FeatureRecord | None

Get a random feature that has non-zero activation.

Parameters:

Name	Type	Description	Default
`sae_name`	`str`	Name of the SAE model	required
`sae_series`	`str`	Series of the SAE model	required
`name`	`str \| None`	Name of the analysis	`None`
`metric_filters`	`dict[str, dict[str, float]] \| None`	Optional dict of metric filters in the format `{"metric_name": {"$gte": value, "$lte": value}}`	`None`

Returns:

Type	Description
`FeatureRecord \| None`	A random feature record with non-zero activation, or None if no such feature exists

Source code in src/lm_saes/database.py

def get_random_alive_feature(
    self,
    sae_name: str,
    sae_series: str,
    name: str | None = None,
    metric_filters: Optional[dict[str, dict[str, float]]] = None,
) -> Optional[FeatureRecord]:
    """Sample one feature that has an analysis with ``max_feature_acts`` greater than zero.

    Args:
        sae_name: Name of the SAE model
        sae_series: Series of the SAE model
        name: Name of the analysis; when given, the matching analysis must carry this name
        metric_filters: Optional dict of metric filters in the format {"metric_name": {"$gte": value, "$lte": value}}

    Returns:
        A random feature record with non-zero activation, or None if no such feature exists
    """
    analysis_criteria: dict[str, Any] = {"max_feature_acts": {"$gt": 0}}
    if name is not None:
        analysis_criteria["name"] = name

    # Each metric filter is applied to the corresponding `metric.<metric_name>` field.
    metric_criteria = {f"metric.{metric_name}": filters for metric_name, filters in (metric_filters or {}).items()}
    query: dict[str, Any] = {
        "sae_name": sae_name,
        "sae_series": sae_series,
        "analyses": {"$elemMatch": analysis_criteria},
        **metric_criteria,
    }

    sampled = self.feature_collection.aggregate([{"$match": query}, {"$sample": {"size": 1}}])
    document = next(sampled, None)
    if document is None:
        return None

    # Convert GridFS references back to numpy arrays
    if self.is_gridfs_enabled():
        document = self._from_gridfs(document)

    return FeatureRecord.model_validate(document)

update_feature

update_feature(
    sae_name: str,
    feature_index: int,
    update_data: dict,
    sae_series: str | None = None,
) -> UpdateResult

Update a feature with additional data.

Parameters:

Name	Type	Description	Default
`sae_name`	`str`	Name of the SAE	required
`feature_index`	`int`	Index of the feature to update	required
`update_data`	`dict`	Dictionary with data to update	required
`sae_series`	`str \| None`	Optional series of the SAE	`None`

Returns:

Type	Description
`UpdateResult`	Result of the update operation.

Raises:

Type	Description
`ValueError`	If the feature doesn't exist

Source code in src/lm_saes/database.py

def update_feature(
    self, sae_name: str, feature_index: int, update_data: dict, sae_series: str | None = None
) -> pymongo.results.UpdateResult:
    """Set additional data on an existing feature document.

    GridFS is enabled if it is not already, and ``update_data`` is passed through
    ``self._to_gridfs`` before being written.

    Args:
        sae_name: Name of the SAE
        feature_index: Index of the feature to update
        update_data: Dictionary with data to update
        sae_series: Series of the SAE; must not be None despite the default

    Returns:
        Result of the update operation.

    Raises:
        ValueError: If ``sae_series`` is None or the feature doesn't exist
    """
    if sae_series is None:
        raise ValueError("sae_series cannot be None")

    existing = self.get_feature(sae_name, sae_series, feature_index)
    if existing is None:
        raise ValueError(f"Feature {feature_index} not found for SAE {sae_name}/{sae_series}")

    # Make sure GridFS is available before converting the payload.
    if not self.is_gridfs_enabled():
        self.enable_gridfs()

    # Convert numpy arrays to GridFS references
    payload = self._to_gridfs(update_data)

    feature_key = {"sae_name": sae_name, "sae_series": sae_series, "index": feature_index}
    return self.feature_collection.update_one(feature_key, {"$set": payload})

add_bookmark

add_bookmark(
    sae_name: str,
    sae_series: str,
    feature_index: int,
    tags: list[str] | None = None,
    notes: str | None = None,
) -> bool

Add a bookmark for a feature.

Parameters:

Name	Type	Description	Default
`sae_name`	`str`	Name of the SAE	required
`sae_series`	`str`	Series of the SAE	required
`feature_index`	`int`	Index of the feature to bookmark	required
`tags`	`list[str] \| None`	Optional list of tags for the bookmark	`None`
`notes`	`str \| None`	Optional notes for the bookmark	`None`

Returns:

Name	Type	Description
`bool`	`bool`	True if bookmark was added, False if it already exists

Raises:

Type	Description
`ValueError`	If the feature doesn't exist

Source code in src/lm_saes/database.py

def add_bookmark(
    self,
    sae_name: str,
    sae_series: str,
    feature_index: int,
    tags: Optional[list[str]] = None,
    notes: Optional[str] = None,
) -> bool:
    """Add a bookmark for a feature.

    Args:
        sae_name: Name of the SAE
        sae_series: Series of the SAE
        feature_index: Index of the feature to bookmark
        tags: Optional list of tags for the bookmark; stored as an empty list when omitted
        notes: Optional notes for the bookmark

    Returns:
        bool: True if bookmark was added, False if it already exists

    Raises:
        ValueError: If the feature doesn't exist
    """
    # Imported locally so this method does not depend on the module's import list.
    from datetime import timezone

    # Check if feature exists
    feature = self.get_feature(sae_name, sae_series, feature_index)
    if feature is None:
        raise ValueError(f"Feature {feature_index} not found for SAE {sae_name}/{sae_series}")

    bookmark_data = {
        "sae_name": sae_name,
        "sae_series": sae_series,
        "feature_index": feature_index,
        # `datetime.utcnow()` is deprecated since Python 3.12 and returns a naive value;
        # an explicit UTC-aware timestamp denotes the same instant without the ambiguity.
        "created_at": datetime.now(timezone.utc),
        "tags": tags or [],
        "notes": notes,
    }

    try:
        result = self.bookmark_collection.insert_one(bookmark_data)
        return result.inserted_id is not None
    except pymongo.errors.DuplicateKeyError:
        # A duplicate key on insert is reported to the caller as "already exists".
        return False

remove_bookmark

remove_bookmark(
    sae_name: str, sae_series: str, feature_index: int
) -> bool

Remove a bookmark for a feature.

Parameters:

Name	Type	Description	Default
`sae_name`	`str`	Name of the SAE	required
`sae_series`	`str`	Series of the SAE	required
`feature_index`	`int`	Index of the feature to remove bookmark from	required

Returns:

Name	Type	Description
`bool`	`bool`	True if bookmark was removed, False if it didn't exist

Source code in src/lm_saes/database.py

def remove_bookmark(self, sae_name: str, sae_series: str, feature_index: int) -> bool:
    """Delete the bookmark of a feature, if there is one.

    Args:
        sae_name: Name of the SAE
        sae_series: Series of the SAE
        feature_index: Index of the feature to remove bookmark from

    Returns:
        bool: True if bookmark was removed, False if it didn't exist
    """
    bookmark_key = {
        "sae_name": sae_name,
        "sae_series": sae_series,
        "feature_index": feature_index,
    }
    deletion = self.bookmark_collection.delete_one(bookmark_key)
    return deletion.deleted_count > 0

is_bookmarked

is_bookmarked(
    sae_name: str, sae_series: str, feature_index: int
) -> bool

Check if a feature is bookmarked.

Parameters:

Name	Type	Description	Default
`sae_name`	`str`	Name of the SAE	required
`sae_series`	`str`	Series of the SAE	required
`feature_index`	`int`	Index of the feature	required

Returns:

Name	Type	Description
`bool`	`bool`	True if the feature is bookmarked, False otherwise

Source code in src/lm_saes/database.py

def is_bookmarked(self, sae_name: str, sae_series: str, feature_index: int) -> bool:
    """Tell whether a bookmark document exists for the given feature.

    Args:
        sae_name: Name of the SAE
        sae_series: Series of the SAE
        feature_index: Index of the feature

    Returns:
        bool: True if the feature is bookmarked, False otherwise
    """
    bookmark_key = {
        "sae_name": sae_name,
        "sae_series": sae_series,
        "feature_index": feature_index,
    }
    return self.bookmark_collection.find_one(bookmark_key) is not None

get_bookmark

get_bookmark(
    sae_name: str, sae_series: str, feature_index: int
) -> BookmarkRecord | None

Get a specific bookmark.

Parameters:

Name	Type	Description	Default
`sae_name`	`str`	Name of the SAE	required
`sae_series`	`str`	Series of the SAE	required
`feature_index`	`int`	Index of the feature	required

Returns:

Name	Type	Description
`BookmarkRecord`	`BookmarkRecord \| None`	The bookmark record if it exists, None otherwise

Source code in src/lm_saes/database.py

def get_bookmark(self, sae_name: str, sae_series: str, feature_index: int) -> Optional[BookmarkRecord]:
    """Fetch the bookmark of a feature.

    Args:
        sae_name: Name of the SAE
        sae_series: Series of the SAE
        feature_index: Index of the feature

    Returns:
        BookmarkRecord: The bookmark record if it exists, None otherwise
    """
    bookmark_key = {
        "sae_name": sae_name,
        "sae_series": sae_series,
        "feature_index": feature_index,
    }
    document = self.bookmark_collection.find_one(bookmark_key)
    return None if document is None else BookmarkRecord.model_validate(document)

list_bookmarks

list_bookmarks(
    sae_name: str | None = None,
    sae_series: str | None = None,
    tags: list[str] | None = None,
    limit: int | None = None,
    skip: int = 0,
) -> list[BookmarkRecord]

List bookmarks with optional filtering.

Source code in src/lm_saes/database.py

def list_bookmarks(
    self,
    sae_name: Optional[str] = None,
    sae_series: Optional[str] = None,
    tags: Optional[list[str]] = None,
    limit: Optional[int] = None,
    skip: int = 0,
) -> list[BookmarkRecord]:
    """List bookmarks, newest first, with optional filtering and paging.

    Args:
        sae_name: Only return bookmarks of this SAE, when given
        sae_series: Only return bookmarks of this SAE series, when given
        tags: Only return bookmarks carrying at least one of these tags, when non-empty
        limit: Maximum number of bookmarks to return, when given
        skip: Number of leading bookmarks to skip; applied only when positive

    Returns:
        The matching bookmark records sorted by ``created_at`` in descending order
    """
    criteria: dict[str, Any] = {}
    for field, value in (("sae_name", sae_name), ("sae_series", sae_series)):
        if value is not None:
            criteria[field] = value
    if tags:
        criteria["tags"] = {"$in": tags}

    results = self.bookmark_collection.find(criteria).sort("created_at", pymongo.DESCENDING)
    if skip > 0:
        results = results.skip(skip)
    if limit is not None:
        results = results.limit(limit)

    return [BookmarkRecord.model_validate(document) for document in results]

update_bookmark

update_bookmark(
    sae_name: str,
    sae_series: str,
    feature_index: int,
    tags: list[str] | None = None,
    notes: str | None = None,
) -> bool

Update an existing bookmark.

Source code in src/lm_saes/database.py

def update_bookmark(
    self,
    sae_name: str,
    sae_series: str,
    feature_index: int,
    tags: Optional[list[str]] = None,
    notes: Optional[str] = None,
) -> bool:
    """Update the tags and/or notes of an existing bookmark.

    Args:
        sae_name: Name of the SAE
        sae_series: Series of the SAE
        feature_index: Index of the bookmarked feature
        tags: New tags; left untouched when None
        notes: New notes; left untouched when None

    Returns:
        bool: True when there was nothing to update or a document was modified, False otherwise
    """
    changes = {field: value for field, value in (("tags", tags), ("notes", notes)) if value is not None}
    if not changes:
        return True  # Nothing to update

    bookmark_key = {
        "sae_name": sae_name,
        "sae_series": sae_series,
        "feature_index": feature_index,
    }
    outcome = self.bookmark_collection.update_one(bookmark_key, {"$set": changes})
    return outcome.modified_count > 0

get_bookmark_count

get_bookmark_count(
    sae_name: str | None = None,
    sae_series: str | None = None,
) -> int

Get the total count of bookmarks with optional filtering.

Source code in src/lm_saes/database.py

def get_bookmark_count(self, sae_name: Optional[str] = None, sae_series: Optional[str] = None) -> int:
    """Count bookmarks, optionally restricted to an SAE name and/or series.

    Args:
        sae_name: Only count bookmarks of this SAE, when given
        sae_series: Only count bookmarks of this SAE series, when given

    Returns:
        Number of matching bookmark documents
    """
    candidates = (("sae_name", sae_name), ("sae_series", sae_series))
    criteria = {field: value for field, value in candidates if value is not None}
    return self.bookmark_collection.count_documents(criteria)

get_available_metrics

get_available_metrics(
    sae_name: str, sae_series: str
) -> list[str]

Get available metrics for an SAE by checking the first feature.

Parameters:

Name	Type	Description	Default
`sae_name`	`str`	Name of the SAE model	required
`sae_series`	`str`	Series of the SAE model	required

Returns:

Type	Description
`list[str]`	List of available metric names

Source code in src/lm_saes/database.py

def get_available_metrics(self, sae_name: str, sae_series: str) -> list[str]:
    """List the metric names found on one feature document of the SAE.

    Only the first feature returned for the SAE is inspected.

    Args:
        sae_name: Name of the SAE model
        sae_series: Series of the SAE model

    Returns:
        List of available metric names; empty when no feature exists or it has no metrics
    """
    # Project onto `metric` only, so the rest of the feature document is not loaded.
    document = self.feature_collection.find_one(
        {"sae_name": sae_name, "sae_series": sae_series},
        {"metric": 1},
    )
    metrics = None if document is None else document.get("metric")
    if metrics is None:
        return []
    return list(metrics.keys())

count_features_with_filters

count_features_with_filters(
    sae_name: str,
    sae_series: str,
    name: str | None = None,
    metric_filters: dict[str, dict[str, float]]
    | None = None,
) -> int

Count features that match the given filters.

Parameters:

Name	Type	Description	Default
`sae_name`	`str`	Name of the SAE model	required
`sae_series`	`str`	Series of the SAE model	required
`name`	`str \| None`	Name of the analysis	`None`
`metric_filters`	`dict[str, dict[str, float]] \| None`	Optional dict of metric filters in the format `{"metric_name": {"$gte": value, "$lte": value}}`	`None`

Returns:

Type	Description
`int`	Number of features matching the filters

Source code in src/lm_saes/database.py

def count_features_with_filters(
    self,
    sae_name: str,
    sae_series: str,
    name: str | None = None,
    metric_filters: Optional[dict[str, dict[str, float]]] = None,
) -> int:
    """Count features that match the given filters.

    Args:
        sae_name: Name of the SAE model
        sae_series: Series of the SAE model
        name: Name of the analysis
        metric_filters: Optional dict of metric filters in the format {"metric_name": {"$gte": value, "$lte": value}}

    Returns:
        Number of features matching the filters
    """
    elem_match: dict[str, Any] = {"max_feature_acts": {"$gt": 0}}
    if name is not None:
        elem_match["name"] = name

    match_filter: dict[str, Any] = {
        "sae_name": sae_name,
        "sae_series": sae_series,
        "analyses": {"$elemMatch": elem_match},
    }

    # Add metric filters if provided
    if metric_filters:
        for metric_name, filters in metric_filters.items():
            match_filter[f"metric.{metric_name}"] = filters

    return self.feature_collection.count_documents(match_filter)

create_circuit

create_circuit(
    sae_set_name: str,
    sae_series: str,
    prompt: str,
    input: CircuitInput,
    config: CircuitConfig,
    name: str | None = None,
    group: str | None = None,
    parent_id: str | None = None,
) -> str

Create a new circuit record with pending status.

Source code in src/lm_saes/database.py

def create_circuit(
    self,
    sae_set_name: str,
    sae_series: str,
    prompt: str,
    input: CircuitInput,
    config: CircuitConfig,
    name: Optional[str] = None,
    group: Optional[str] = None,
    parent_id: Optional[str] = None,
) -> str:
    """Create a new circuit record with pending status.

    The record starts with zero progress and with no progress phase, error message or attribution id.

    Args:
        sae_set_name: Name of the SAE set the circuit belongs to
        sae_series: Series of the SAEs
        prompt: Prompt stored with the circuit
        input: Circuit input; stored as ``input.model_dump()``
        config: Circuit configuration; stored as ``config.model_dump()``
        name: Optional name of the circuit
        group: Optional group of the circuit
        parent_id: Optional id of a parent circuit

    Returns:
        The id of the inserted circuit document, as a string
    """
    # Imported locally so this method does not depend on the module's import list.
    from datetime import timezone

    result = self.circuit_collection.insert_one(
        {
            "name": name,
            "group": group,
            "parent_id": parent_id,
            "sae_set_name": sae_set_name,
            "sae_series": sae_series,
            "prompt": prompt,
            "input": input.model_dump(),
            "config": config.model_dump(),
            # `datetime.utcnow()` is deprecated since Python 3.12 and returns a naive value;
            # an explicit UTC-aware timestamp denotes the same instant without the ambiguity.
            "created_at": datetime.now(timezone.utc),
            "status": CircuitStatus.PENDING,
            "progress": 0.0,
            "progress_phase": None,
            "error_message": None,
            "attribution_id": None,
        }
    )
    return str(result.inserted_id)

get_circuit

get_circuit(circuit_id: str) -> CircuitRecord | None

Get a circuit by its ID.

Source code in src/lm_saes/database.py

def get_circuit(self, circuit_id: str) -> Optional[CircuitRecord]:
    """Get a circuit by its ID.

    Args:
        circuit_id: String form of the circuit's document id

    Returns:
        The circuit record with ``id`` set to the stringified ``_id``, or None when the lookup
        raises or no document matches
    """
    try:
        document = self.circuit_collection.find_one({"_id": ObjectId(circuit_id)})
    except Exception:
        # Any failure while building the id or querying is reported as "not found".
        return None
    if document is None:
        return None

    # Expose the Mongo `_id` under the `id` key as a plain string.
    document_id = document.pop("_id")
    document["id"] = str(document_id)
    return CircuitRecord.model_validate(document)

list_circuits

list_circuits(
    sae_series: str | None = None,
    group: str | None = None,
    limit: int | None = None,
    skip: int = 0,
) -> list[dict[str, Any]]

List circuits with optional filtering.

Note: attribution_id is excluded from the listing for efficiency.

Source code in src/lm_saes/database.py

def list_circuits(
    self,
    sae_series: Optional[str] = None,
    group: Optional[str] = None,
    limit: Optional[int] = None,
    skip: int = 0,
) -> list[dict[str, Any]]:
    """Return circuit documents, newest first, with optional filtering.

    Args:
        sae_series: If given, only circuits with this ``sae_series`` match.
        group: If given, only circuits with this ``group`` match.
        limit: If given, passed to the cursor's ``limit``.
        skip: Number of leading results to skip; applied only when positive.

    Returns:
        Raw documents sorted by ``created_at`` descending. Each has its
        ``_id`` replaced by a string ``id``; the ``attribution_id`` field is
        left out by the projection.
    """
    candidates = (("sae_series", sae_series), ("group", group))
    filters: dict[str, Any] = {key: value for key, value in candidates if value is not None}

    # The projection drops attribution_id from every listed document.
    cursor = self.circuit_collection.find(filters, projection={"attribution_id": 0})
    cursor = cursor.sort("created_at", pymongo.DESCENDING)

    if skip > 0:
        cursor = cursor.skip(skip)
    if limit is not None:
        cursor = cursor.limit(limit)

    def _with_string_id(document):
        # Swap the ObjectId primary key for its string form under "id".
        document["id"] = str(document.pop("_id"))
        return document

    return [_with_string_id(document) for document in cursor]

update_circuits_group

update_circuits_group(
    circuit_ids: list[str], group: str | None
) -> int

Update the group for multiple circuits.

Source code in src/lm_saes/database.py

def update_circuits_group(self, circuit_ids: list[str], group: Optional[str]) -> int:
    """Assign ``group`` to every circuit whose ID appears in ``circuit_ids``.

    Returns the ``modified_count`` reported by the bulk update.
    """
    selector = {"_id": {"$in": list(map(ObjectId, circuit_ids))}}
    change = {"$set": {"group": group}}
    outcome = self.circuit_collection.update_many(selector, change)
    return outcome.modified_count

update_circuit

update_circuit(
    circuit_id: str, update_data: dict[str, Any]
) -> bool

Update a circuit by its ID.

Source code in src/lm_saes/database.py

def update_circuit(self, circuit_id: str, update_data: dict[str, Any]) -> bool:
    """Apply ``update_data`` to one circuit through a ``$set`` update.

    Returns ``True`` only when the update reports a non-zero
    ``modified_count``.
    """
    selector = {"_id": ObjectId(circuit_id)}
    change = {"$set": update_data}
    outcome = self.circuit_collection.update_one(selector, change)
    return outcome.modified_count > 0

delete_circuit

delete_circuit(circuit_id: str) -> bool

Delete a circuit by its ID.

Also deletes the associated attribution from GridFS if it exists.

Source code in src/lm_saes/database.py

def delete_circuit(self, circuit_id: str) -> bool:
    """Remove a circuit and, where present, its attribution file in GridFS.

    The GridFS deletion is best-effort: a failure there does not stop the
    circuit document from being deleted. Any other exception (including a
    malformed ``circuit_id``) makes the method return ``False``.

    Returns:
        ``True`` when a circuit document was deleted, ``False`` otherwise.
    """
    try:
        selector = {"_id": ObjectId(circuit_id)}

        # Look the circuit up first to learn which GridFS file belongs to it.
        document = self.circuit_collection.find_one(selector)
        attribution_id = document.get("attribution_id") if document else None
        if attribution_id and self.fs:
            try:
                self.fs.delete(ObjectId(attribution_id))
            except Exception:
                pass  # Ignore errors when deleting GridFS file

        outcome = self.circuit_collection.delete_one(selector)
    except Exception:
        return False
    return outcome.deleted_count > 0

update_circuit_progress

update_circuit_progress(
    circuit_id: str,
    progress: float,
    progress_phase: str | None = None,
) -> bool

Update the progress of a circuit generation.

Parameters:

Name	Type	Description	Default
`circuit_id`	`str`	The circuit ID.	required
`progress`	`float`	Progress percentage (0-100).	required
`progress_phase`	`str \| None`	Optional phase description.	`None`

Returns:

Type	Description
`bool`	True if update was successful.

Source code in src/lm_saes/database.py

def update_circuit_progress(
    self,
    circuit_id: str,
    progress: float,
    progress_phase: Optional[str] = None,
) -> bool:
    """Record how far a circuit generation has advanced.

    Args:
        circuit_id: The circuit ID.
        progress: Progress percentage (0-100).
        progress_phase: Optional phase description; the stored phase is left
            untouched when this is ``None``.

    Returns:
        ``True`` when the update reports a non-zero ``modified_count``.
    """
    changes: dict[str, Any] = {"progress": progress}
    if progress_phase is not None:
        changes["progress_phase"] = progress_phase

    selector = {"_id": ObjectId(circuit_id)}
    outcome = self.circuit_collection.update_one(selector, {"$set": changes})
    return outcome.modified_count > 0

update_circuit_status

update_circuit_status(
    circuit_id: str,
    status: str,
    error_message: str | None = None,
) -> bool

Update the status of a circuit.

Parameters:

Name	Type	Description	Default
`circuit_id`	`str`	The circuit ID.	required
`status`	`str`	New status (pending, running, completed, failed).	required
`error_message`	`str \| None`	Optional error message for failed status.	`None`

Returns:

Type	Description
`bool`	True if update was successful.

Source code in src/lm_saes/database.py

def update_circuit_status(
    self,
    circuit_id: str,
    status: str,
    error_message: Optional[str] = None,
) -> bool:
    """Set a circuit's status and, optionally, its error message.

    Args:
        circuit_id: The circuit ID.
        status: New status (pending, running, completed, failed).
        error_message: Optional error message for failed status; the stored
            message is left untouched when this is ``None``.

    Returns:
        ``True`` when the update reports a non-zero ``modified_count``.
    """
    changes: dict[str, Any] = {"status": status}
    if error_message is not None:
        changes["error_message"] = error_message

    selector = {"_id": ObjectId(circuit_id)}
    outcome = self.circuit_collection.update_one(selector, {"$set": changes})
    return outcome.modified_count > 0

store_attribution

store_attribution(
    circuit_id: str, attribution: AttributionResult
) -> bool

Store attribution data to GridFS and update circuit record.

Source code in src/lm_saes/database.py

def store_attribution(self, circuit_id: str, attribution: AttributionResult) -> bool:
    """Store attribution data to GridFS and update circuit record.

    The attribution is pickled, written to GridFS under the filename
    ``circuit_<circuit_id>_attribution``, and the resulting file ID is saved
    on the circuit document as ``attribution_id``.

    Args:
        circuit_id: The circuit ID.
        attribution: The attribution result to persist.

    Returns:
        ``True`` when the circuit document was updated. ``False`` when no
        document was modified; in that case the file just written is removed
        again so it does not linger unreferenced.

    Raises:
        AssertionError: If GridFS is not available (``self.fs`` is ``None``).
    """
    assert self.fs is not None

    # Convert the ID up front: a malformed circuit_id must fail before
    # anything is written to GridFS.
    object_id = ObjectId(circuit_id)

    attribution_bytes = pickle.dumps(attribution)
    attribution_id = self.fs.put(attribution_bytes, filename=f"circuit_{circuit_id}_attribution")

    result = self.circuit_collection.update_one(
        {"_id": object_id},
        {"$set": {"attribution_id": attribution_id}},
    )
    if result.modified_count == 0:
        # Nothing references the new file, so drop it instead of leaking it.
        self.fs.delete(attribution_id)
        return False
    return True

load_attribution

load_attribution(
    circuit_id: str,
) -> AttributionResult | None

Load attribution data from GridFS.

Source code in src/lm_saes/database.py

@timer.time("load_attribution")
def load_attribution(self, circuit_id: str) -> Optional[AttributionResult]:
    """Read a circuit's attribution back from GridFS and unpickle it.

    Returns ``None`` when the circuit does not exist or has no
    ``attribution_id``.

    Raises:
        AssertionError: If GridFS is not available (``self.fs`` is ``None``).
    """
    assert self.fs is not None

    document = self.circuit_collection.find_one({"_id": ObjectId(circuit_id)})
    attribution_id = None if document is None else document.get("attribution_id")
    if attribution_id is None:
        return None

    payload = self.fs.get(attribution_id).read()
    # NOTE(review): unpickling can execute arbitrary code; this is only safe
    # if everything written to this GridFS bucket is trusted — confirm.
    return pickle.loads(payload)

get_circuit_status

get_circuit_status(
    circuit_id: str,
) -> dict[str, Any] | None

Get just the status information for a circuit.

Source code in src/lm_saes/database.py

def get_circuit_status(self, circuit_id: str) -> Optional[dict[str, Any]]:
    """Fetch only the status-related fields of a circuit.

    Returns:
        ``None`` when no circuit matches; otherwise a dict with ``status``
        (defaulting to ``CircuitStatus.PENDING``), ``progress`` (defaulting
        to ``0.0``), ``progress_phase`` and ``error_message``.
    """
    status_fields = ("status", "progress", "progress_phase", "error_message")
    document = self.circuit_collection.find_one(
        {"_id": ObjectId(circuit_id)},
        projection=dict.fromkeys(status_fields, 1),
    )

    if document is not None:
        return {
            "status": document.get("status", CircuitStatus.PENDING),
            "progress": document.get("progress", 0.0),
            "progress_phase": document.get("progress_phase"),
            "error_message": document.get("error_message"),
        }
    return None

Infrastructure

LanguageModelConfig pydantic-model

model_name pydantic-field

model_from_pretrained_path pydantic-field

use_flash_attn pydantic-field

cache_dir pydantic-field

local_files_only pydantic-field

max_length pydantic-field

backend pydantic-field

tokenizer_only pydantic-field

prepend_bos pydantic-field

bos_token_id pydantic-field

eos_token_id pydantic-field

pad_token_id pydantic-field

from_pretrained_sae staticmethod

TransformerLensLanguageModel

run_with_cache_until

HuggingFaceLanguageModel

DatasetConfig pydantic-model

dataset_name_or_path pydantic-field

cache_dir pydantic-field

is_dataset_on_disk pydantic-field

MongoDBConfig pydantic-model

MongoClient

enable_gridfs

disable_gridfs

is_gridfs_enabled

update_sae

update_sae_set

get_random_alive_feature

update_feature

add_bookmark

remove_bookmark

is_bookmarked

get_bookmark

list_bookmarks

update_bookmark

get_bookmark_count

get_available_metrics

count_features_with_filters

create_circuit

get_circuit

list_circuits

update_circuits_group

update_circuit

delete_circuit

update_circuit_progress

update_circuit_status

store_attribution

load_attribution

get_circuit_status

LanguageModelConfig `pydantic-model`

model_name `pydantic-field`

model_from_pretrained_path `pydantic-field`

use_flash_attn `pydantic-field`

cache_dir `pydantic-field`

local_files_only `pydantic-field`

max_length `pydantic-field`

backend `pydantic-field`

tokenizer_only `pydantic-field`

prepend_bos `pydantic-field`

bos_token_id `pydantic-field`

eos_token_id `pydantic-field`

pad_token_id `pydantic-field`

from_pretrained_sae `staticmethod`

DatasetConfig `pydantic-model`

dataset_name_or_path `pydantic-field`

cache_dir `pydantic-field`

is_dataset_on_disk `pydantic-field`

MongoDBConfig `pydantic-model`