neo4j
diff --git a/‎CHANGELOG.md
Lines changed: 5 additions & 0 deletions b/‎CHANGELOG.md
Lines changed: 5 additions & 0 deletions
diff --git a/‎docs/source/api.rst
Lines changed: 13 additions & 0 deletions b/‎docs/source/api.rst
Lines changed: 13 additions & 0 deletions
diff --git a/‎docs/source/user_guide_kg_builder.rst
Lines changed: 119 additions & 68 deletions b/‎docs/source/user_guide_kg_builder.rst
Lines changed: 119 additions & 68 deletions
diff --git a/‎examples/README.md
Lines changed: 2 additions & 0 deletions b/‎examples/README.md
Lines changed: 2 additions & 0 deletions
diff --git a/‎examples/build_graph/automatic_schema_extraction/simple_kg_builder_schema_from_pdf.py
Lines changed: 97 additions & 0 deletions b/‎examples/build_graph/automatic_schema_extraction/simple_kg_builder_schema_from_pdf.py
Lines changed: 97 additions & 0 deletions
@@ -2,10 +2,15 @@
 
 ## Next
 
+### Added
+
+- Added support for automatic schema extraction from text using LLMs. In the `SimpleKGPipeline`, automatic schema extraction is enabled by default when the user provides no schema.
+
 ### Fixed
 
 - Fixed a bug where `spacy` and `rapidfuzz` needed to be installed even if not using the relevant entity resolvers.
 
+
 ## 1.7.0
 
 ### Added
 
@@ -77,6 +77,12 @@ SchemaBuilder
 .. autoclass:: neo4j_graphrag.experimental.components.schema.SchemaBuilder
     :members: run
 
+SchemaFromTextExtractor
+-----------------------
+
+.. autoclass:: neo4j_graphrag.experimental.components.schema.SchemaFromTextExtractor
+    :members: run
+
 EntityRelationExtractor
 =======================
 
@@ -362,6 +368,13 @@ ERExtractionTemplate
     :members:
     :exclude-members: format
 
+SchemaExtractionTemplate
+------------------------
+
+.. autoclass:: neo4j_graphrag.generation.prompts.SchemaExtractionTemplate
+    :members:
+    :exclude-members: format
+
 Text2CypherTemplate
 --------------------
 
 
@@ -21,7 +21,7 @@ A Knowledge Graph (KG) construction pipeline requires a few components (some of
 - **Data loader**: extract text from files (PDFs, ...).
 - **Text splitter**: split the text into smaller pieces of text (chunks), manageable by the LLM context window (token limit).
 - **Chunk embedder** (optional): compute the chunk embeddings.
-- **Schema builder**: provide a schema to ground the LLM extracted entities and relations and obtain an easily navigable KG.
+- **Schema builder**: provide a schema to ground the LLM extracted entities and relations and obtain an easily navigable KG. The schema can be provided manually or extracted automatically using an LLM.
 - **Lexical graph builder**: build the lexical graph (Document, Chunk and their relationships) (optional).
 - **Entity and relation extractor**: extract relevant entities and relations from the text.
 - **Knowledge Graph writer**: save the identified entities and relations.
@@ -75,10 +75,11 @@ Graph Schema
 
 It is possible to guide the LLM by supplying a list of entities, relationships,
 and instructions on how to connect them. However, note that the extracted graph
-may not fully adhere to these guidelines. Entities and relationships can be
-represented as either simple strings (for their labels) or dictionaries. If using
-a dictionary, it must include a label key and can optionally include description
-and properties keys, as shown below:
+may not fully adhere to these guidelines unless schema enforcement is enabled
+(see :ref:`Schema Enforcement Behaviour <schema-enforcement-behaviour>`). Entities and relationships can be represented
+as either simple strings (for their labels) or dictionaries. If using a dictionary,
+it must include a label key and can optionally include description and properties keys,
+as shown below:
 
 .. code:: python
 
@@ -117,14 +118,20 @@ This schema information can be provided to the `SimpleKGBuilder` as demonstrated
 
 .. code:: python
 
+    # Using the schema parameter (recommended approach)
     kg_builder = SimpleKGPipeline(
         # ...
-        entities=ENTITIES,
-        relations=RELATIONS,
-        potential_schema=POTENTIAL_SCHEMA,
+        schema={
+            "entities": ENTITIES,
+            "relations": RELATIONS,
+            "potential_schema": POTENTIAL_SCHEMA
+        },
         # ...
     )
 
+.. note::
+   By default, if no schema is provided to the `SimpleKGPipeline`, automatic schema extraction will be performed using the LLM (see :ref:`Automatic Schema Extraction`).
+
 Extra configurations
 --------------------
 
@@ -412,41 +419,44 @@ within the configuration file.
         "neo4j_database": "myDb",
         "on_error": "IGNORE",
         "prompt_template": "...",
-        "entities": [
-            "Person",
-            {
-                "label": "House",
-                "description": "Family the person belongs to",
-                "properties": [
-                    {"name": "name", "type": "STRING"}
-                ]
-            },
-            {
-                "label": "Planet",
-                "properties": [
-                    {"name": "name", "type": "STRING"},
-                    {"name": "weather", "type": "STRING"}
-                ]
-            }
-        ],
-        "relations": [
-            "PARENT_OF",
-            {
-                "label": "HEIR_OF",
-                "description": "Used for inheritor relationship between father and sons"
-            },
-            {
-                "label": "RULES",
-                "properties": [
-                    {"name": "fromYear", "type": "INTEGER"}
-                ]
-            }
-        ],
-        "potential_schema": [
-            ["Person", "PARENT_OF", "Person"],
-            ["Person", "HEIR_OF", "House"],
-            ["House", "RULES", "Planet"]
-        ],
+        
+        "schema": {
+            "entities": [
+                "Person",
+                {
+                    "label": "House",
+                    "description": "Family the person belongs to",
+                    "properties": [
+                        {"name": "name", "type": "STRING"}
+                    ]
+                },
+                {
+                    "label": "Planet",
+                    "properties": [
+                        {"name": "name", "type": "STRING"},
+                        {"name": "weather", "type": "STRING"}
+                    ]
+                }
+            ],
+            "relations": [
+                "PARENT_OF",
+                {
+                    "label": "HEIR_OF",
+                    "description": "Used for inheritor relationship between father and sons"
+                },
+                {
+                    "label": "RULES",
+                    "properties": [
+                        {"name": "fromYear", "type": "INTEGER"}
+                    ]
+                }
+            ],
+            "potential_schema": [
+                ["Person", "PARENT_OF", "Person"],
+                ["Person", "HEIR_OF", "House"],
+                ["House", "RULES", "Planet"]
+            ]
+        },
         "lexical_graph_config": {
             "chunk_node_label": "TextPart"
         }
@@ -462,31 +472,32 @@ or in YAML:
     neo4j_database: myDb
     on_error: IGNORE
     prompt_template: ...
-    entities:
-      - label: Person
-      - label: House
-        description: Family the person belongs to
-        properties:
-          - name: name
-            type: STRING
-      - label: Planet
-        properties:
-          - name: name
-            type: STRING
-          - name: weather
-            type: STRING
-    relations:
-      - label: PARENT_OF
-      - label: HEIR_OF
-        description: Used for inheritor relationship between father and sons
-      - label: RULES
-        properties:
-          - name: fromYear
-            type: INTEGER
-    potential_schema:
-      - ["Person", "PARENT_OF", "Person"]
-      - ["Person", "HEIR_OF", "House"]
-      - ["House", "RULES", "Planet"]
+    schema:
+      entities:
+        - Person
+        - label: House
+          description: Family the person belongs to
+          properties:
+            - name: name
+              type: STRING
+        - label: Planet
+          properties:
+            - name: name
+              type: STRING
+            - name: weather
+              type: STRING
+      relations:
+        - PARENT_OF
+        - label: HEIR_OF
+          description: Used for inheritor relationship between father and sons
+        - label: RULES
+          properties:
+            - name: fromYear
+              type: INTEGER
+      potential_schema:
+        - ["Person", "PARENT_OF", "Person"]
+        - ["Person", "HEIR_OF", "House"]
+        - ["House", "RULES", "Planet"]
     lexical_graph_config:
         chunk_node_label: TextPart
 
@@ -791,6 +802,44 @@ Here is a code block illustrating these concepts:
 After validation, this schema is saved in a `SchemaConfig` object, whose dict representation is passed
 to the LLM.
 
+Automatic Schema Extraction 
+---------------------------
+
+Instead of manually defining the schema, you can use the `SchemaFromTextExtractor` component to automatically extract a schema from your text using an LLM:
+
+.. code:: python
+
+    from neo4j_graphrag.experimental.components.schema import SchemaFromTextExtractor
+    from neo4j_graphrag.llm import OpenAILLM
+
+    # Instantiate the automatic schema extractor component
+    schema_extractor = SchemaFromTextExtractor(
+        llm=OpenAILLM(
+            model_name="gpt-4o",
+            model_params={
+                "max_tokens": 2000,
+                "response_format": {"type": "json_object"},
+            },
+        )
+    )
+
+    # Extract the schema from the text
+    extracted_schema = await schema_extractor.run(text="Some text")
+
+The `SchemaFromTextExtractor` component analyzes the text and identifies entity types, relationship types, and their property types. It creates a complete `SchemaConfig` object that can be used in the same way as a manually defined schema.
+
+You can also save and reload the extracted schema:
+
+.. code:: python
+
+    # Save the schema to JSON or YAML files
+    extracted_schema.store_as_json("my_schema.json")
+    extracted_schema.store_as_yaml("my_schema.yaml")
+    
+    # Later, reload the schema from file
+    from neo4j_graphrag.experimental.components.schema import SchemaConfig
+    restored_schema = SchemaConfig.from_file("my_schema.json")  # or my_schema.yaml
+
 
 Entity and Relation Extractor
 =============================
@@ -832,6 +881,8 @@ The LLM to use can be customized, the only constraint is that it obeys the :ref:
 
+.. _schema-enforcement-behaviour:
+
 Schema Enforcement Behaviour
 ----------------------------
 By default, even if a schema is provided to guide the LLM in the entity and relation extraction, the LLM response is not validated against that schema.
 This behaviour can be changed by using the `enforce_schema` flag in the `LLMEntityRelationExtractor` constructor:
 
 
@@ -3,6 +3,7 @@
 This folder contains examples usage for the different features
 supported by the `neo4j-graphrag` package:
 
+- [Automatic Schema Extraction](#schema-extraction) from PDF or text
 - [Build Knowledge Graph](#build-knowledge-graph) from PDF or text
 - [Retrieve](#retrieve) information from the graph
 - [Question Answering](#answer-graphrag) (Q&A)
@@ -122,6 +123,7 @@ are listed in [the last section of this file](#customize).
 - [Chunk embedder]()
 - Schema Builder:
   - [User-defined](./customize/build_graph/components/schema_builders/schema.py)
+  - [Automatic schema extraction](./automatic_schema_extraction/schema_from_text.py)
 - Entity Relation Extractor:
   - [LLM-based](./customize/build_graph/components/extractors/llm_entity_relation_extractor.py)
   - [LLM-based with custom prompt](./customize/build_graph/components/extractors/llm_entity_relation_extractor_with_custom_prompt.py)
 
@@ -0,0 +1,97 @@
+"""This example demonstrates how to use SimpleKGPipeline with automatic schema extraction
+from a PDF file. When no schema is provided to SimpleKGPipeline, automatic schema extraction
+is performed using the LLM.
+
+Note: This example requires an OpenAI API key to be set in the .env file.
+"""
+
+import asyncio
+import logging
+import os
+from pathlib import Path
+from dotenv import load_dotenv
+import neo4j
+
+from neo4j_graphrag.experimental.pipeline.kg_builder import SimpleKGPipeline
+from neo4j_graphrag.llm import OpenAILLM
+from neo4j_graphrag.embeddings import OpenAIEmbeddings
+
+# Populate the process environment from a local .env file, if one exists.
+load_dotenv()
+
+# Install the default logging handler and set the library logger to INFO.
+logging.basicConfig()
+logging.getLogger("neo4j_graphrag").setLevel(logging.INFO)
+
+# The sample PDF is looked up in the "data" directory located two levels
+# above the directory that contains this script.
+root_dir = Path(__file__).parents[2]
+PDF_FILE = str(
+    root_dir.joinpath("data", "Harry Potter and the Chamber of Secrets Summary.pdf")
+)
+
+
+async def run_kg_pipeline_with_auto_schema() -> None:
+    """Run the SimpleKGPipeline with automatic schema extraction from a PDF file.
+
+    Connection settings are read from the ``NEO4J_URI``, ``NEO4J_USER`` and
+    ``NEO4J_PASSWORD`` environment variables, falling back to local defaults.
+    The Neo4j driver and the LLM's async client are closed on exit, even if
+    creating the components or running the pipeline fails.
+    """
+
+    # Define Neo4j connection
+    uri = os.getenv("NEO4J_URI", "neo4j://localhost:7687")
+    user = os.getenv("NEO4J_USER", "neo4j")
+    password = os.getenv("NEO4J_PASSWORD", "password")
+
+    # Define LLM parameters
+    llm_model_params = {
+        "max_tokens": 2000,
+        "response_format": {"type": "json_object"},
+        "temperature": 0,  # Lower temperature for more consistent output
+    }
+
+    # Use the driver as a context manager so it is closed even when a later
+    # step (creating the LLM or the embedder, running the pipeline) raises.
+    with neo4j.GraphDatabase.driver(uri, auth=(user, password)) as driver:
+        # Create the LLM instance
+        llm = OpenAILLM(
+            model_name="gpt-4o",
+            model_params=llm_model_params,
+        )
+
+        try:
+            # Create the embedder instance
+            embedder = OpenAIEmbeddings()
+
+            # Create a SimpleKGPipeline instance without providing a schema
+            # This will trigger automatic schema extraction
+            kg_builder = SimpleKGPipeline(
+                llm=llm,
+                driver=driver,
+                embedder=embedder,
+                from_pdf=True,
+            )
+
+            print(f"Processing PDF file: {PDF_FILE}")
+            # Run the pipeline on the PDF file
+            await kg_builder.run_async(file_path=PDF_FILE)
+
+        finally:
+            # Close the LLM's async HTTP client; the driver is closed by the
+            # enclosing ``with`` block.
+            await llm.async_client.close()
+
+
+async def main() -> None:
+    """Entry point: ensure the data directory exists, then run the pipeline.
+
+    If the sample PDF is missing, a warning is printed and the pipeline is
+    not started.
+    """
+    # Make sure the data directory is present (no error if it already is)
+    (root_dir / "data").mkdir(exist_ok=True)
+
+    # Only run the pipeline when the input PDF is actually available
+    pdf_path = Path(PDF_FILE)
+    if pdf_path.exists():
+        await run_kg_pipeline_with_auto_schema()
+        return
+
+    print(f"Warning: PDF file not found at {PDF_FILE}")
+    print("Please replace with a valid PDF file path.")
+
+
+if __name__ == "__main__":
+    asyncio.run(main())