Mirror of https://github.com/toeverything/AFFiNE.git (synced 2026-02-13 21:05:19 +00:00)
feat(server): context awareness for copilot (#9611)
Fixes PD-2167, PD-2169, PD-2190.
@@ -21,26 +21,26 @@ pub trait TextSplitter: Send + Sync {
     fn split_documents(&self, documents: &[Document]) -> Result<Vec<Document>, TextSplitterError> {
         let mut texts: Vec<String> = Vec::new();
-        let mut metadatas: Vec<HashMap<String, Value>> = Vec::new();
+        let mut metadata: Vec<HashMap<String, Value>> = Vec::new();
         documents.iter().for_each(|d| {
             texts.push(d.page_content.clone());
-            metadatas.push(d.metadata.clone());
+            metadata.push(d.metadata.clone());
         });

-        self.create_documents(&texts, &metadatas)
+        self.create_documents(&texts, &metadata)
     }

     fn create_documents(
         &self,
         text: &[String],
-        metadatas: &[HashMap<String, Value>],
+        metadata: &[HashMap<String, Value>],
     ) -> Result<Vec<Document>, TextSplitterError> {
-        let mut metadatas = metadatas.to_vec();
-        if metadatas.is_empty() {
-            metadatas = vec![HashMap::new(); text.len()];
+        let mut metadata = metadata.to_vec();
+        if metadata.is_empty() {
+            metadata = vec![HashMap::new(); text.len()];
         }

-        if text.len() != metadatas.len() {
+        if text.len() != metadata.len() {
             return Err(TextSplitterError::MetadataTextMismatch);
         }
@@ -48,7 +48,7 @@ pub trait TextSplitter: Send + Sync {
         for i in 0..text.len() {
             let chunks = self.split_text(&text[i])?;
             for chunk in chunks {
-                let document = Document::new(chunk).with_metadata(metadatas[i].clone());
+                let document = Document::new(chunk).with_metadata(metadata[i].clone());
                 documents.push(document);
             }
         }
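
The rename from metadatas to metadata is behavior-preserving: texts and metadata stay paired by index, an empty metadata slice is padded with empty maps, a length mismatch is rejected as MetadataTextMismatch, and every chunk produced from text[i] inherits a clone of metadata[i]. Below is a minimal standalone sketch of that pairing logic, using a hypothetical Doc struct and whitespace splitting in place of the crate's Document type and token-based split_text.

use std::collections::HashMap;

// Hypothetical stand-in for the crate's Document type, for illustration only.
#[derive(Debug, Clone)]
struct Doc {
    page_content: String,
    metadata: HashMap<String, String>,
}

// Mirrors the pairing logic of create_documents: each chunk of text[i]
// receives a clone of metadata[i].
fn create_docs(
    text: &[String],
    metadata: &[HashMap<String, String>],
) -> Result<Vec<Doc>, String> {
    let mut metadata = metadata.to_vec();
    if metadata.is_empty() {
        metadata = vec![HashMap::new(); text.len()];
    }
    if text.len() != metadata.len() {
        return Err("metadata/text length mismatch".into());
    }
    let mut docs = Vec::new();
    for i in 0..text.len() {
        // Whitespace split stands in for the real token-based split_text.
        for chunk in text[i].split_whitespace() {
            docs.push(Doc {
                page_content: chunk.to_string(),
                metadata: metadata[i].clone(),
            });
        }
    }
    Ok(docs)
}

fn main() {
    let texts = vec!["hello world".to_string()];
    let meta = vec![HashMap::from([("doc_id".to_string(), "42".to_string())])];
    let docs = create_docs(&texts, &meta).unwrap();
    // Both chunks of "hello world" carry the doc_id of their source document.
    assert_eq!(docs.len(), 2);
    assert_eq!(docs[1].page_content, "world");
    assert_eq!(docs[1].metadata["doc_id"], "42");
}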
@@ -25,11 +25,11 @@ impl Default for SplitterOptions {
 impl SplitterOptions {
     pub fn new() -> Self {
         SplitterOptions {
-            chunk_size: 512,
-            chunk_overlap: 0,
+            chunk_size: 7168,
+            chunk_overlap: 128,
             model_name: String::from("gpt-3.5-turbo"),
             encoding_name: String::from("cl100k_base"),
-            trim_chunks: false,
+            trim_chunks: true,
         }
     }
 }
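
The new defaults trade many small windows for fewer large ones: roughly 7168 tokens per chunk with a 128-token overlap, and chunks are now trimmed of surrounding whitespace. The sketch below restates those defaults in a locally declared copy of the struct so it compiles on its own; the field types are assumptions, only the names and values come from the hunk above.

// Locally declared copy of SplitterOptions, for illustration only; the real
// struct lives in the splitter module touched by this diff.
struct SplitterOptions {
    chunk_size: usize,
    chunk_overlap: usize,
    model_name: String,
    encoding_name: String,
    trim_chunks: bool,
}

impl SplitterOptions {
    fn new() -> Self {
        SplitterOptions {
            chunk_size: 7168,   // was 512: larger windows of context per chunk
            chunk_overlap: 128, // was 0: overlap keeps continuity across chunk boundaries
            model_name: String::from("gpt-3.5-turbo"),
            encoding_name: String::from("cl100k_base"),
            trim_chunks: true,  // was false: strip surrounding whitespace from each chunk
        }
    }
}

fn main() {
    let opts = SplitterOptions::new();
    // Effective stride: each new chunk advances chunk_size - chunk_overlap tokens.
    assert_eq!(opts.chunk_size - opts.chunk_overlap, 7040);
    assert!(opts.trim_chunks);
    println!(
        "model {} / encoding {}: {} tokens per chunk, {} overlap",
        opts.model_name, opts.encoding_name, opts.chunk_size, opts.chunk_overlap
    );
}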
@@ -63,6 +63,7 @@ impl SplitterOptions {

     pub fn get_tokenizer_from_str(s: &str) -> Option<Tokenizer> {
         match s.to_lowercase().as_str() {
+            "o200k_base" => Some(Tokenizer::O200kBase),
             "cl100k_base" => Some(Tokenizer::Cl100kBase),
             "p50k_base" => Some(Tokenizer::P50kBase),
             "r50k_base" => Some(Tokenizer::R50kBase),
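
The single added match arm lets the name lookup resolve o200k_base, the encoding used by newer OpenAI models such as GPT-4o, alongside the existing encodings. Below is a self-contained sketch of the same case-insensitive lookup, with a locally declared enum standing in for the crate's Tokenizer.

// Locally declared stand-in for the crate's Tokenizer enum; only the variants
// visible in the diff are reproduced here, purely for illustration.
#[derive(Debug, PartialEq)]
enum Tokenizer {
    O200kBase,
    Cl100kBase,
    P50kBase,
    R50kBase,
}

// Mirrors get_tokenizer_from_str: case-insensitive lookup, None for unknown names.
fn get_tokenizer_from_str(s: &str) -> Option<Tokenizer> {
    match s.to_lowercase().as_str() {
        "o200k_base" => Some(Tokenizer::O200kBase),
        "cl100k_base" => Some(Tokenizer::Cl100kBase),
        "p50k_base" => Some(Tokenizer::P50kBase),
        "r50k_base" => Some(Tokenizer::R50kBase),
        _ => None,
    }
}

fn main() {
    assert_eq!(get_tokenizer_from_str("O200K_BASE"), Some(Tokenizer::O200kBase));
    assert_eq!(get_tokenizer_from_str("unknown"), None);
}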