From 3d547baad303bf255ceee3271ce061ffbfec2de5 Mon Sep 17 00:00:00 2001
From: Thorsten Klein
Date: Wed, 19 Jun 2024 14:50:29 +0200
Subject: [PATCH] add: in-house csv loader

---
 pkg/datastore/documentloader/csv.go | 252 ++++++++++++++++++++++++++++
 1 file changed, 252 insertions(+)
 create mode 100644 pkg/datastore/documentloader/csv.go

diff --git a/pkg/datastore/documentloader/csv.go b/pkg/datastore/documentloader/csv.go
new file mode 100644
index 00000000..ba3e4928
--- /dev/null
+++ b/pkg/datastore/documentloader/csv.go
@@ -0,0 +1,252 @@
+package documentloader
+
+import (
+	"context"
+	"encoding/csv"
+	"encoding/json"
+	"errors"
+	"fmt"
+	"io"
+	"slices"
+	"strings"
+
+	"github.com/gptscript-ai/knowledge/pkg/datastore/types"
+	vs "github.com/gptscript-ai/knowledge/pkg/vectorstore"
+)
+
+// Compile time check to ensure CSV satisfies the DocumentLoader interface.
+var _ types.DocumentLoader = (*CSV)(nil)
+
+// CSVDocumentFormat selects how a CSV row is rendered into document content.
+type CSVDocumentFormat string
+
+const (
+	// OriginalFormat renders a row as a CSV record using the configured separator.
+	OriginalFormat CSVDocumentFormat = "original"
+
+	// JSONFormat renders a row as a JSON object keyed by column name.
+	JSONFormat CSVDocumentFormat = "json"
+
+	// MarkdownFormat renders a row as one "column: value" line per field.
+	MarkdownFormat CSVDocumentFormat = "markdown"
+)
+
+// CSVOptions contains options for configuring the CSV loader.
+type CSVOptions struct {
+	// Separator is the rune used to separate fields in the CSV file.
+	Separator rune
+
+	// LazyQuotes controls whether the CSV reader should use lazy quotes mode.
+	LazyQuotes bool
+
+	// Columns is a list of column names to filter and include in the loaded documents.
+	// An empty list includes every column.
+	Columns []string
+
+	// ConcatRows controls whether to concatenate rows into a single document.
+	ConcatRows bool
+
+	// RowSeparator is the string used to separate rows in the concatenated document. Default is "\n".
+	RowSeparator string
+
+	// MaxConcatRows is the maximum number of rows to concatenate into a single document.
+	// Values <= 0 disable the limit. Default is 100.
+	MaxConcatRows int
+
+	// Format is the format in which the documents will be stored. Default is "original".
+	Format CSVDocumentFormat
+}
+
+// CSV represents a CSV document loader.
+type CSV struct {
+	r    io.Reader
+	opts CSVOptions
+}
+
+// NewCSV creates a new CSV loader with an io.Reader and optional configuration options.
+// It returns a pointer to the created CSV loader.
+func NewCSV(r io.Reader, optFns ...func(o *CSVOptions)) *CSV {
+	opts := CSVOptions{
+		Separator:     ',',
+		LazyQuotes:    false,
+		ConcatRows:    false,
+		MaxConcatRows: 100,
+		RowSeparator:  "\n",
+		Format:        OriginalFormat,
+	}
+
+	for _, fn := range optFns {
+		fn(&opts)
+	}
+
+	return &CSV{
+		r:    r,
+		opts: opts,
+	}
+}
+
+// Load reads CSV records from the loader's reader and turns them into documents.
+//
+// The first record is the header. Every following record is rendered according to
+// opts.Format, restricted to opts.Columns when that list is non-empty. Without
+// ConcatRows each row becomes one document whose metadata holds its 1-based "row"
+// number. With ConcatRows up to MaxConcatRows rendered rows are joined with
+// RowSeparator into one document whose metadata holds "rowStart" and "rowEnd".
+// Empty input yields no documents and no error.
+func (l *CSV) Load(ctx context.Context) ([]vs.Document, error) {
+	reader := csv.NewReader(l.r)
+	reader.Comma = l.opts.Separator
+	reader.LazyQuotes = l.opts.LazyQuotes
+
+	// Read header
+	header, err := reader.Read()
+	if err != nil {
+		if errors.Is(err, io.EOF) {
+			return nil, nil
+		}
+		return nil, fmt.Errorf("reading csv header: %w", err)
+	}
+
+	// Resolve the column filter once instead of searching opts.Columns for every cell.
+	selected := make([]int, 0, len(header))
+	for i, name := range header {
+		if len(l.opts.Columns) == 0 || slices.Contains(l.opts.Columns, name) {
+			selected = append(selected, i)
+		}
+	}
+
+	var (
+		docs    []vs.Document
+		rown    uint     // 1-based number of the current data row (header excluded)
+		pending []string // rendered rows not yet emitted as a concatenated document
+		first   uint     // row number of pending[0]
+	)
+
+	// flush emits the pending rows as a single document and resets the buffer.
+	flush := func() {
+		if len(pending) == 0 {
+			return
+		}
+		docs = append(docs, vs.Document{
+			Content:  strings.Join(pending, l.opts.RowSeparator),
+			Metadata: map[string]any{"rowStart": first, "rowEnd": rown},
+		})
+		pending = pending[:0]
+	}
+
+	for {
+		if err := ctx.Err(); err != nil {
+			return nil, err
+		}
+
+		row, err := reader.Read()
+		if errors.Is(err, io.EOF) {
+			break
+		}
+		if err != nil {
+			return nil, fmt.Errorf("reading csv row %d: %w", rown+1, err)
+		}
+
+		content, err := l.formatRow(header, row, selected)
+		if err != nil {
+			return nil, fmt.Errorf("formatting csv row %d: %w", rown+1, err)
+		}
+
+		rown++
+
+		// We're not concatenating, so just append to the result slice and continue
+		if !l.opts.ConcatRows {
+			docs = append(docs, vs.Document{
+				Content:  content,
+				Metadata: map[string]any{"row": rown},
+			})
+			continue
+		}
+
+		// Concatenating rows
+		if len(pending) == 0 {
+			first = rown
+		}
+		pending = append(pending, content)
+		if l.opts.MaxConcatRows > 0 && len(pending) >= l.opts.MaxConcatRows {
+			flush()
+		}
+	}
+
+	// Emit the final, possibly partial, batch.
+	flush()
+
+	return docs, nil
+}
+
+// formatRow renders the selected fields of row according to opts.Format.
+// selected holds indices into header and row; encoding/csv rejects records whose
+// field count differs from the header's, so the two slices have equal length.
+func (l *CSV) formatRow(header, row []string, selected []int) (string, error) {
+	switch l.opts.Format {
+	case MarkdownFormat:
+		// Transposed layout: one "column: value" line per field.
+		lines := make([]string, 0, len(selected))
+		for _, i := range selected {
+			lines = append(lines, fmt.Sprintf("%s: %s", header[i], row[i]))
+		}
+		return strings.Join(lines, "\n"), nil
+
+	case JSONFormat:
+		// Assemble the object by hand: marshalling a map would sort the keys and
+		// lose the column order.
+		var b strings.Builder
+		b.WriteByte('{')
+		for n, i := range selected {
+			key, err := json.Marshal(header[i])
+			if err != nil {
+				return "", err
+			}
+			val, err := json.Marshal(row[i])
+			if err != nil {
+				return "", err
+			}
+			if n > 0 {
+				b.WriteByte(',')
+			}
+			b.Write(key)
+			b.WriteByte(':')
+			b.Write(val)
+		}
+		b.WriteByte('}')
+		return b.String(), nil
+
+	case OriginalFormat, "":
+		// Re-encode through csv.Writer so fields containing the separator, quotes
+		// or newlines stay correctly quoted.
+		fields := make([]string, 0, len(selected))
+		for _, i := range selected {
+			fields = append(fields, row[i])
+		}
+		var b strings.Builder
+		w := csv.NewWriter(&b)
+		w.Comma = l.opts.Separator
+		if err := w.Write(fields); err != nil {
+			return "", err
+		}
+		w.Flush()
+		if err := w.Error(); err != nil {
+			return "", err
+		}
+		// csv.Writer terminates every record with a newline; drop it.
+		return strings.TrimSuffix(b.String(), "\n"), nil
+
+	default:
+		return "", fmt.Errorf("unsupported csv document format %q", l.opts.Format)
+	}
+}
+
+// LoadAndSplit loads CSV documents from the provided reader and splits them using the specified text splitter.
+func (l *CSV) LoadAndSplit(ctx context.Context, splitter types.TextSplitter) ([]vs.Document, error) {
+	docs, err := l.Load(ctx)
+	if err != nil {
+		return nil, err
+	}
+
+	return splitter.SplitDocuments(docs)
+}