diff --git a/app/Config/services.php b/app/Config/services.php index d7345823150..aafe0bacc99 100644 --- a/app/Config/services.php +++ b/app/Config/services.php @@ -22,6 +22,18 @@ // Callback URL for social authentication methods 'callback_url' => env('APP_URL', false), + // LLM Service + // Options: openai + 'llm' => env('LLM_SERVICE', ''), + + // OpenAI API-compatible service details + 'openai' => [ + 'endpoint' => env('OPENAI_ENDPOINT', 'https://api.openai.com'), + 'key' => env('OPENAI_KEY', ''), + 'embedding_model' => env('OPENAI_EMBEDDING_MODEL', 'text-embedding-3-small'), + 'query_model' => env('OPENAI_QUERY_MODEL', 'gpt-4o'), + ], + 'github' => [ 'client_id' => env('GITHUB_APP_ID', false), 'client_secret' => env('GITHUB_APP_SECRET', false), diff --git a/app/Console/Commands/RegenerateVectorsCommand.php b/app/Console/Commands/RegenerateVectorsCommand.php new file mode 100644 index 00000000000..700d05300d8 --- /dev/null +++ b/app/Console/Commands/RegenerateVectorsCommand.php @@ -0,0 +1,46 @@ +delete(); + + $types = $entityProvider->all(); + foreach ($types as $type => $typeInstance) { + $this->info("Creating jobs to store vectors for {$type} data..."); + /** @var Entity[] $entities */ + $typeInstance->newQuery()->chunkById(100, function ($entities) { + foreach ($entities as $entity) { + dispatch(new StoreEntityVectorsJob($entity)); + } + }); + } + } +} diff --git a/app/Search/SearchController.php b/app/Search/SearchController.php index 2fce6a3d53f..a688385e7c3 100644 --- a/app/Search/SearchController.php +++ b/app/Search/SearchController.php @@ -6,6 +6,7 @@ use BookStack\Entities\Queries\QueryPopular; use BookStack\Entities\Tools\SiblingFetcher; use BookStack\Http\Controller; +use BookStack\Search\Vectors\VectorSearchRunner; use Illuminate\Http\Request; class SearchController extends Controller @@ -139,4 +140,19 @@ public function searchSiblings(Request $request, SiblingFetcher $siblingFetcher) return view('entities.list-basic', ['entities' => $entities, 'style' => 'compact']); } + + public function searchQuery(Request $request, VectorSearchRunner $runner) + { + $query = $request->get('query', ''); + + if ($query) { + $results = $runner->run($query); + } else { + $results = null; + } + + return view('search.query', [ + 'results' => $results, + ]); + } } diff --git a/app/Search/SearchIndex.php b/app/Search/SearchIndex.php index 36f71f6ccc7..9b34fa04e28 100644 --- a/app/Search/SearchIndex.php +++ b/app/Search/SearchIndex.php @@ -6,6 +6,8 @@ use BookStack\Entities\EntityProvider; use BookStack\Entities\Models\Entity; use BookStack\Entities\Models\Page; +use BookStack\Search\Vectors\StoreEntityVectorsJob; +use BookStack\Search\Vectors\VectorQueryServiceProvider; use BookStack\Util\HtmlDocument; use DOMNode; use Illuminate\Database\Eloquent\Builder; @@ -25,7 +27,7 @@ class SearchIndex public static string $softDelimiters = ".-"; public function __construct( - protected EntityProvider $entityProvider + protected EntityProvider $entityProvider, ) { } @@ -37,6 +39,10 @@ public function indexEntity(Entity $entity): void $this->deleteEntityTerms($entity); $terms = $this->entityToTermDataArray($entity); $this->insertTerms($terms); + + if (VectorQueryServiceProvider::isEnabled()) { + dispatch(new StoreEntityVectorsJob($entity)); + } } /** @@ -47,9 +53,15 @@ public function indexEntity(Entity $entity): void public function indexEntities(array $entities): void { $terms = []; + $vectorQueryEnabled = VectorQueryServiceProvider::isEnabled(); + foreach ($entities as $entity) { $entityTerms = $this->entityToTermDataArray($entity); array_push($terms, ...$entityTerms); + + if ($vectorQueryEnabled) { + dispatch(new StoreEntityVectorsJob($entity)); + } } $this->insertTerms($terms); diff --git a/app/Search/Vectors/EntityVectorGenerator.php b/app/Search/Vectors/EntityVectorGenerator.php new file mode 100644 index 00000000000..9563694a321 --- /dev/null +++ b/app/Search/Vectors/EntityVectorGenerator.php @@ -0,0 +1,84 @@ +vectorQueryServiceProvider->get(); + + $text = $this->entityToPlainText($entity); + $chunks = $this->chunkText($text); + $embeddings = $this->chunksToEmbeddings($chunks, $vectorService); + + $this->deleteExistingEmbeddingsForEntity($entity); + $this->storeEmbeddings($embeddings, $chunks, $entity); + } + + protected function deleteExistingEmbeddingsForEntity(Entity $entity): void + { + SearchVector::query() + ->where('entity_type', '=', $entity->getMorphClass()) + ->where('entity_id', '=', $entity->id) + ->delete(); + } + + protected function storeEmbeddings(array $embeddings, array $textChunks, Entity $entity): void + { + $toInsert = []; + + foreach ($embeddings as $index => $embedding) { + $text = $textChunks[$index]; + $toInsert[] = [ + 'entity_id' => $entity->id, + 'entity_type' => $entity->getMorphClass(), + 'embedding' => DB::raw('VEC_FROMTEXT("[' . implode(',', $embedding) . ']")'), + 'text' => $text, + ]; + } + + // TODO - Chunk inserts + SearchVector::query()->insert($toInsert); + } + + /** + * @param string[] $chunks + * @return float[] array + */ + protected function chunksToEmbeddings(array $chunks, VectorQueryService $vectorQueryService): array + { + $embeddings = []; + foreach ($chunks as $index => $chunk) { + $embeddings[$index] = $vectorQueryService->generateEmbeddings($chunk); + } + return $embeddings; + } + + /** + * @return string[] + */ + protected function chunkText(string $text): array + { + // TODO - Join adjacent smaller chunks up + return array_filter(array_map(function (string $section): string { + return trim($section); + }, explode("\n", $text))); + } + + protected function entityToPlainText(Entity $entity): string + { + $text = $entity->name . "\n\n" . $entity->{$entity->textField}; + // TODO - Add tags + return $text; + } +} diff --git a/app/Search/Vectors/SearchVector.php b/app/Search/Vectors/SearchVector.php new file mode 100644 index 00000000000..4a5555f87d9 --- /dev/null +++ b/app/Search/Vectors/SearchVector.php @@ -0,0 +1,16 @@ +key = $this->options['key'] ?? ''; + $this->endpoint = $this->options['endpoint'] ?? ''; + $this->embeddingModel = $this->options['embedding_model'] ?? ''; + $this->queryModel = $this->options['query_model'] ?? ''; + } + + protected function jsonRequest(string $method, string $uri, array $data): array + { + $fullUrl = rtrim($this->endpoint, '/') . '/' . ltrim($uri, '/'); + $client = $this->http->buildClient(30); + $request = $this->http->jsonRequest($method, $fullUrl, $data) + ->withHeader('Authorization', 'Bearer ' . $this->key); + + $response = $client->sendRequest($request); + return json_decode($response->getBody()->getContents(), true); + } + + public function generateEmbeddings(string $text): array + { + $response = $this->jsonRequest('POST', 'v1/embeddings', [ + 'input' => $text, + 'model' => $this->embeddingModel, + ]); + + return $response['data'][0]['embedding']; + } + + public function query(string $input, array $context): string + { + $formattedContext = implode("\n", $context); + + $response = $this->jsonRequest('POST', 'v1/chat/completions', [ + 'model' => $this->queryModel, + 'messages' => [ + [ + 'role' => 'developer', + 'content' => 'You are a helpful assistant providing search query responses. Be specific, factual and to-the-point in response. Don\'t try to converse or continue the conversation.' + ], + [ + 'role' => 'user', + 'content' => "Provide a response to the below given QUERY using the below given CONTEXT. The CONTEXT is split into parts via lines. Ignore any nonsensical lines of CONTEXT.\nQUERY: {$input}\n\nCONTEXT: {$formattedContext}", + ] + ], + ]); + + return $response['choices'][0]['message']['content'] ?? ''; + } +} diff --git a/app/Search/Vectors/Services/VectorQueryService.php b/app/Search/Vectors/Services/VectorQueryService.php new file mode 100644 index 00000000000..746f95f5b22 --- /dev/null +++ b/app/Search/Vectors/Services/VectorQueryService.php @@ -0,0 +1,21 @@ +generateAndStore($this->entity); + } +} diff --git a/app/Search/Vectors/VectorQueryServiceProvider.php b/app/Search/Vectors/VectorQueryServiceProvider.php new file mode 100644 index 00000000000..eae7149d03c --- /dev/null +++ b/app/Search/Vectors/VectorQueryServiceProvider.php @@ -0,0 +1,36 @@ +getServiceName(); + + if ($service === 'openai') { + return new OpenAiVectorQueryService(config('services.openai'), $this->http); + } + + throw new \Exception("No '{$service}' LLM service found"); + } + + protected static function getServiceName(): string + { + return strtolower(config('services.llm')); + } + + public static function isEnabled(): bool + { + return !empty(static::getServiceName()); + } +} diff --git a/app/Search/Vectors/VectorSearchRunner.php b/app/Search/Vectors/VectorSearchRunner.php new file mode 100644 index 00000000000..53b1a4bd696 --- /dev/null +++ b/app/Search/Vectors/VectorSearchRunner.php @@ -0,0 +1,34 @@ +vectorQueryServiceProvider->get(); + $queryVector = $queryService->generateEmbeddings($query); + + // TODO - Apply permissions + // TODO - Join models + $topMatches = SearchVector::query()->select('text', 'entity_type', 'entity_id') + ->selectRaw('VEC_DISTANCE_COSINE(VEC_FROMTEXT("[' . implode(',', $queryVector) . ']"), embedding) as distance') + ->orderBy('distance', 'asc') + ->having('distance', '<', 0.6) + ->limit(10) + ->get(); + + $matchesText = array_values(array_map(fn (SearchVector $match) => $match->text, $topMatches->all())); + $llmResult = $queryService->query($query, $matchesText); + + return [ + 'llm_result' => $llmResult, + 'entity_matches' => $topMatches->toArray() + ]; + } +} diff --git a/database/migrations/2025_03_24_155748_create_search_vectors_table.php b/database/migrations/2025_03_24_155748_create_search_vectors_table.php new file mode 100644 index 00000000000..0ae67c2256f --- /dev/null +++ b/database/migrations/2025_03_24_155748_create_search_vectors_table.php @@ -0,0 +1,37 @@ +string('entity_type', 100); + $table->integer('entity_id'); + $table->text('text'); + + $table->index(['entity_type', 'entity_id']); + }); + + $table = DB::getTablePrefix() . 'search_vectors'; + + // TODO - Vector size might need to be dynamic + DB::statement("ALTER TABLE {$table} ADD COLUMN (embedding VECTOR(1536) NOT NULL)"); + DB::statement("ALTER TABLE {$table} ADD VECTOR INDEX (embedding) DISTANCE=cosine"); + } + + /** + * Reverse the migrations. + */ + public function down(): void + { + Schema::dropIfExists('search_vectors'); + } +}; diff --git a/resources/views/search/query.blade.php b/resources/views/search/query.blade.php new file mode 100644 index 00000000000..e8b4c84779c --- /dev/null +++ b/resources/views/search/query.blade.php @@ -0,0 +1,29 @@ +@extends('layouts.simple') + +@section('body') +
+ +
+ + +
+ + @if($results) +

Results

+ +

LLM Output

+

{{ $results['llm_result'] }}

+ +

Entity Matches

+ @foreach($results['entity_matches'] as $match) +
+
{{ $match['entity_type'] }}:{{ $match['entity_id'] }}; Distance: {{ $match['distance'] }}
+
+ match text +
{{ $match['text'] }}
+
+
+ @endforeach + @endif +
+@stop diff --git a/routes/web.php b/routes/web.php index 8184725834c..15fe6d69b2d 100644 --- a/routes/web.php +++ b/routes/web.php @@ -187,6 +187,7 @@ // Search Route::get('/search', [SearchController::class, 'search']); + Route::get('/search/query', [SearchController::class, 'searchQuery']); Route::get('/search/book/{bookId}', [SearchController::class, 'searchBook']); Route::get('/search/chapter/{bookId}', [SearchController::class, 'searchChapter']); Route::get('/search/entity/siblings', [SearchController::class, 'searchSiblings']);