# Source code for design_research_agents._memory._graph_extraction
"""Heuristic graph extraction helpers for design relationship text."""
from __future__ import annotations
import re
from design_research_agents._contracts._memory import GraphEdgeRecord, GraphNodeRecord
_RELATION_PATTERNS: tuple[tuple[re.Pattern[str], str], ...] = (
(
re.compile(
r"(?P<left>[A-Za-z][A-Za-z0-9 _'/-]{0,80})\s+is connected to\s+(?P<right>[A-Za-z][A-Za-z0-9 _'/-]{0,80})",
re.IGNORECASE,
),
"connected_to",
),
(
re.compile(
r"(?P<left>[A-Za-z][A-Za-z0-9 _'/-]{0,80})\s+connects to\s+(?P<right>[A-Za-z][A-Za-z0-9 _'/-]{0,80})",
re.IGNORECASE,
),
"connected_to",
),
(
re.compile(
r"(?P<left>[A-Za-z][A-Za-z0-9 _'/-]{0,80})\s+drives\s+(?P<right>[A-Za-z][A-Za-z0-9 _'/-]{0,80})",
re.IGNORECASE,
),
"drives",
),
(
re.compile(
r"(?P<left>[A-Za-z][A-Za-z0-9 _'/-]{0,80})\s+powers\s+(?P<right>[A-Za-z][A-Za-z0-9 _'/-]{0,80})",
re.IGNORECASE,
),
"powers",
),
(
re.compile(
r"(?P<left>[A-Za-z][A-Za-z0-9 _'/-]{0,80})\s+supports\s+(?P<right>[A-Za-z][A-Za-z0-9 _'/-]{0,80})",
re.IGNORECASE,
),
"supports",
),
(
re.compile(
r"(?P<left>[A-Za-z][A-Za-z0-9 _'/-]{0,80})\s+uses\s+(?P<right>[A-Za-z][A-Za-z0-9 _'/-]{0,80})",
re.IGNORECASE,
),
"uses",
),
(
re.compile(
r"(?P<left>[A-Za-z][A-Za-z0-9 _'/-]{0,80})\s+depends on\s+(?P<right>[A-Za-z][A-Za-z0-9 _'/-]{0,80})",
re.IGNORECASE,
),
"depends_on",
),
(
re.compile(
r"(?P<left>[A-Za-z][A-Za-z0-9 _'/-]{0,80})\s+defines\s+(?P<right>[A-Za-z][A-Za-z0-9 _'/-]{0,80})",
re.IGNORECASE,
),
"defines",
),
)
# NOTE: "[docs]" Sphinx HTML link artifact removed from the scraped source.
def extract_graph_records_from_text(text: str) -> tuple[list[GraphNodeRecord], list[GraphEdgeRecord]]:
    """Extract nodes and relationships from simple design statements.

    The extraction is intentionally heuristic and deterministic. It is useful
    for bootstrapping graph memory from structured requirement text, but it is
    not intended to replace model-based information extraction.

    Args:
        text: Source text containing simple relationship statements.

    Returns:
        Tuple ``(nodes, edges)`` extracted from the text.
    """
    # Nodes are deduplicated by node_id; edges keep one entry per matched sentence.
    node_index: dict[str, GraphNodeRecord] = {}
    edge_records: list[GraphEdgeRecord] = []
    for sentence in _split_sentences(text):
        # Patterns are tried in declaration order; only the first one that
        # yields a usable (left, right) pair produces an edge for a sentence.
        for pattern, relationship in _RELATION_PATTERNS:
            found = pattern.search(sentence)
            if found is None:
                continue
            source_name = _normalize_entity_name(found.group("left"))
            target_name = _normalize_entity_name(found.group("right"))
            # Skip degenerate matches: empty names or self-referential edges.
            if not source_name or not target_name or source_name == target_name:
                continue
            source_node = _build_node(source_name)
            target_node = _build_node(target_name)
            node_index[source_node.node_id] = source_node
            node_index[target_node.node_id] = target_node
            edge_records.append(
                GraphEdgeRecord(
                    source_id=source_node.node_id,
                    target_id=target_node.node_id,
                    relationship=relationship,
                    metadata={"source_sentence": sentence},
                )
            )
            break
    return list(node_index.values()), edge_records
def _split_sentences(text: str) -> list[str]:
"""Split free text into small sentence-like chunks."""
segments = re.split(r"[.;\n]+", text)
return [segment.strip() for segment in segments if segment.strip()]
def _normalize_entity_name(value: str) -> str:
"""Normalize one extracted entity name."""
collapsed = " ".join(value.strip().split())
if not collapsed:
return ""
return collapsed
def _build_node(name: str) -> GraphNodeRecord:
    """Return one graph node for a normalized entity name.

    The node id is a deterministic slug of ``name`` and every extracted
    node is typed as a ``component``.
    """
    slug = _slugify(name)
    return GraphNodeRecord(node_id=slug, name=name, node_type="component")
def _slugify(value: str) -> str:
"""Return a deterministic identifier derived from ``value``."""
normalized = re.sub(r"[^a-z0-9]+", "-", value.lower()).strip("-")
return normalized or "entity"
# Public API: only the extraction entry point is exported; the pattern table
# and helper functions are module-private.
__all__ = ["extract_graph_records_from_text"]