feat(server): context awareness for copilot (#9611)

fix PD-2167
fix PD-2169
fix PD-2190
This commit is contained in:
darkskygit
2025-03-13 11:44:55 +00:00
parent 05f3069efd
commit d8373f66e7
51 changed files with 2101 additions and 294 deletions

View File

@@ -21,26 +21,26 @@ pub trait TextSplitter: Send + Sync {
 fn split_documents(&self, documents: &[Document]) -> Result<Vec<Document>, TextSplitterError> {
 let mut texts: Vec<String> = Vec::new();
-let mut metadatas: Vec<HashMap<String, Value>> = Vec::new();
+let mut metadata: Vec<HashMap<String, Value>> = Vec::new();
 documents.iter().for_each(|d| {
 texts.push(d.page_content.clone());
-metadatas.push(d.metadata.clone());
+metadata.push(d.metadata.clone());
 });
-self.create_documents(&texts, &metadatas)
+self.create_documents(&texts, &metadata)
 }
 fn create_documents(
 &self,
 text: &[String],
-metadatas: &[HashMap<String, Value>],
+metadata: &[HashMap<String, Value>],
 ) -> Result<Vec<Document>, TextSplitterError> {
-let mut metadatas = metadatas.to_vec();
-if metadatas.is_empty() {
-metadatas = vec![HashMap::new(); text.len()];
+let mut metadata = metadata.to_vec();
+if metadata.is_empty() {
+metadata = vec![HashMap::new(); text.len()];
 }
-if text.len() != metadatas.len() {
+if text.len() != metadata.len() {
 return Err(TextSplitterError::MetadataTextMismatch);
 }
@@ -48,7 +48,7 @@ pub trait TextSplitter: Send + Sync {
 for i in 0..text.len() {
 let chunks = self.split_text(&text[i])?;
 for chunk in chunks {
-let document = Document::new(chunk).with_metadata(metadatas[i].clone());
+let document = Document::new(chunk).with_metadata(metadata[i].clone());
 documents.push(document);
 }
 }

View File

@@ -25,11 +25,11 @@ impl Default for SplitterOptions {
 impl SplitterOptions {
 pub fn new() -> Self {
 SplitterOptions {
-chunk_size: 512,
-chunk_overlap: 0,
+chunk_size: 7168,
+chunk_overlap: 128,
 model_name: String::from("gpt-3.5-turbo"),
 encoding_name: String::from("cl100k_base"),
-trim_chunks: false,
+trim_chunks: true,
 }
 }
 }
@@ -63,6 +63,7 @@ impl SplitterOptions {
pub fn get_tokenizer_from_str(s: &str) -> Option<Tokenizer> {
match s.to_lowercase().as_str() {
"o200k_base" => Some(Tokenizer::O200kBase),
"cl100k_base" => Some(Tokenizer::Cl100kBase),
"p50k_base" => Some(Tokenizer::P50kBase),
"r50k_base" => Some(Tokenizer::R50kBase),