use crate::error::SpandxResult;
use camino::{Utf8Path, Utf8PathBuf};
use std::collections::BTreeMap;
use tokio::fs::{File, OpenOptions};
use tokio::io::{AsyncReadExt, AsyncWriteExt};
use tracing::debug;

/// Binary index file for fast lookups in data files
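///
/// The index maps string keys to byte offsets in the data file it indexes.
/// On-disk layout (all integers little-endian): the entry count as a `u32`,
/// followed by one record per entry: key length as a `u32`, the UTF-8 key
/// bytes, and the offset as a `u64`. Entries are held in a `BTreeMap`, so
/// keys are kept, iterated, and written in sorted order.
///
/// Typical lifecycle (sketch, mirroring the tests below):
///
/// ```ignore
/// let mut index = IndexFile::create("gems.idx").await?;
/// index.add_entry("rails:7.0.0", 0).await?;
/// index.finalize().await?;
///
/// let index = IndexFile::open("gems.idx").await?;
/// assert_eq!(index.find_offset("rails:7.0.0").await?, Some(0));
/// ```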
#[derive(Debug)]
pub struct IndexFile {
path: Utf8PathBuf,
entries: BTreeMap<String, u64>, // key -> offset
is_dirty: bool,
}

impl IndexFile {
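    /// Creates a new, empty in-memory index for `path`, creating the parent
    /// directory if needed. Nothing is written to disk until `finalize` is called.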
pub async fn create<P: AsRef<Utf8Path>>(path: P) -> SpandxResult<Self> {
let path = path.as_ref().to_path_buf();
// Ensure parent directory exists
if let Some(parent) = path.parent() {
tokio::fs::create_dir_all(parent).await?;
}
Ok(Self {
path,
entries: BTreeMap::new(),
is_dirty: false,
})
}
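
    /// Opens an existing index file and loads all entries into memory.
    /// Returns an empty index if the file does not exist yet.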
pub async fn open<P: AsRef<Utf8Path>>(path: P) -> SpandxResult<Self> {
let path = path.as_ref().to_path_buf();
let mut entries = BTreeMap::new();
if path.exists() {
let mut file = File::open(&path).await?;
// Read the number of entries (4 bytes, little-endian)
let mut count_bytes = [0u8; 4];
file.read_exact(&mut count_bytes).await?;
let entry_count = u32::from_le_bytes(count_bytes) as usize;
debug!("Loading index file {} with {} entries", path, entry_count);
// Read each entry: key_length (4 bytes) + key + offset (8 bytes)
for _ in 0..entry_count {
// Read key length
let mut key_len_bytes = [0u8; 4];
file.read_exact(&mut key_len_bytes).await?;
let key_len = u32::from_le_bytes(key_len_bytes) as usize;
// Read key
let mut key_bytes = vec![0u8; key_len];
file.read_exact(&mut key_bytes).await?;
let key = String::from_utf8(key_bytes)?;
// Read offset
let mut offset_bytes = [0u8; 8];
file.read_exact(&mut offset_bytes).await?;
let offset = u64::from_le_bytes(offset_bytes);
entries.insert(key, offset);
}
}
Ok(Self {
path,
entries,
is_dirty: false,
})
}
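
    /// Inserts or updates the offset recorded for `key` and marks the index dirty.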
pub async fn add_entry(&mut self, key: &str, offset: u64) -> SpandxResult<()> {
self.entries.insert(key.to_string(), offset);
self.is_dirty = true;
Ok(())
}
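
    /// Looks up the data-file offset stored for `key`, if any.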
pub async fn find_offset(&self, key: &str) -> SpandxResult<Option<u64>> {
Ok(self.entries.get(key).copied())
}
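
    /// Writes the full index to disk if it has changed since the last write.
    /// The file is rewritten from scratch on every call.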
pub async fn finalize(&mut self) -> SpandxResult<()> {
if !self.is_dirty {
return Ok(());
}
let mut file = OpenOptions::new()
.create(true)
.write(true)
.truncate(true)
.open(&self.path)
.await?;
// Write number of entries
let entry_count = self.entries.len() as u32;
file.write_all(&entry_count.to_le_bytes()).await?;
// Write each entry
for (key, offset) in &self.entries {
// Write key length
let key_bytes = key.as_bytes();
let key_len = key_bytes.len() as u32;
file.write_all(&key_len.to_le_bytes()).await?;
// Write key
file.write_all(key_bytes).await?;
// Write offset
file.write_all(&offset.to_le_bytes()).await?;
}
file.flush().await?;
self.is_dirty = false;
debug!("Finalized index file {} with {} entries", self.path, self.entries.len());
Ok(())
}

    /// Returns the number of entries in the index.
    pub fn len(&self) -> usize {
        self.entries.len()
    }

    /// Returns `true` if the index has no entries.
    pub fn is_empty(&self) -> bool {
        self.entries.is_empty()
    }

    /// Iterates over all keys in sorted (lexicographic) order.
    pub fn keys(&self) -> impl Iterator<Item = &String> {
        self.entries.keys()
    }

    /// Returns the path of the index file on disk.
    pub fn path(&self) -> &Utf8Path {
        &self.path
    }

/// Get range of keys for binary search optimization
pub fn key_range(&self) -> Option<(&str, &str)> {
if self.entries.is_empty() {
None
} else {
let first_key = self.entries.keys().next().unwrap();
let last_key = self.entries.keys().last().unwrap();
Some((first_key, last_key))
}
}

/// Find all keys with a given prefix
pub fn find_keys_with_prefix(&self, prefix: &str) -> Vec<&String> {
self.entries
.keys()
.filter(|key| key.starts_with(prefix))
.collect()
}
}

#[cfg(test)]
mod tests {
use super::*;
use tempfile::TempDir;

#[tokio::test]
async fn test_index_file_create_and_add() {
let temp_dir = TempDir::new().unwrap();
let file_path = Utf8PathBuf::from_path_buf(temp_dir.path().join("test.idx")).unwrap();
let mut index_file = IndexFile::create(&file_path).await.unwrap();
index_file.add_entry("rails:7.0.0", 0).await.unwrap();
index_file.add_entry("sinatra:2.0.0", 42).await.unwrap();
index_file.add_entry("rack:2.0.0", 100).await.unwrap();
assert_eq!(index_file.len(), 3);
assert!(!index_file.is_empty());
let offset = index_file.find_offset("sinatra:2.0.0").await.unwrap();
assert_eq!(offset, Some(42));
let no_offset = index_file.find_offset("unknown:1.0.0").await.unwrap();
assert_eq!(no_offset, None);
}

#[tokio::test]
async fn test_index_file_finalize_and_reload() {
let temp_dir = TempDir::new().unwrap();
let file_path = Utf8PathBuf::from_path_buf(temp_dir.path().join("test.idx")).unwrap();
// Create and populate index
{
let mut index_file = IndexFile::create(&file_path).await.unwrap();
index_file.add_entry("rails:7.0.0", 0).await.unwrap();
index_file.add_entry("sinatra:2.0.0", 42).await.unwrap();
index_file.add_entry("rack:2.0.0", 100).await.unwrap();
index_file.finalize().await.unwrap();
}
// Reload and verify
{
let index_file = IndexFile::open(&file_path).await.unwrap();
assert_eq!(index_file.len(), 3);
let offset = index_file.find_offset("sinatra:2.0.0").await.unwrap();
assert_eq!(offset, Some(42));
let offset = index_file.find_offset("rack:2.0.0").await.unwrap();
assert_eq!(offset, Some(100));
}
}

#[tokio::test]
async fn test_index_file_sorted_order() {
let temp_dir = TempDir::new().unwrap();
let file_path = Utf8PathBuf::from_path_buf(temp_dir.path().join("test.idx")).unwrap();
let mut index_file = IndexFile::create(&file_path).await.unwrap();
// Add entries in non-alphabetical order
index_file.add_entry("zebra:1.0.0", 200).await.unwrap();
index_file.add_entry("apple:1.0.0", 0).await.unwrap();
index_file.add_entry("banana:1.0.0", 100).await.unwrap();
let keys: Vec<&String> = index_file.keys().collect();
assert_eq!(keys, vec!["apple:1.0.0", "banana:1.0.0", "zebra:1.0.0"]);
let range = index_file.key_range().unwrap();
assert_eq!(range, ("apple:1.0.0", "zebra:1.0.0"));
}

#[tokio::test]
async fn test_index_file_prefix_search() {
let temp_dir = TempDir::new().unwrap();
let file_path = Utf8PathBuf::from_path_buf(temp_dir.path().join("test.idx")).unwrap();
let mut index_file = IndexFile::create(&file_path).await.unwrap();
index_file.add_entry("rails:6.0.0", 0).await.unwrap();
index_file.add_entry("rails:7.0.0", 50).await.unwrap();
index_file.add_entry("rake:13.0.0", 100).await.unwrap();
index_file.add_entry("sinatra:2.0.0", 150).await.unwrap();
let rails_keys = index_file.find_keys_with_prefix("rails:");
assert_eq!(rails_keys.len(), 2);
assert!(rails_keys.contains(&&"rails:6.0.0".to_string()));
assert!(rails_keys.contains(&&"rails:7.0.0".to_string()));
let sinatra_keys = index_file.find_keys_with_prefix("sinatra:");
assert_eq!(sinatra_keys.len(), 1);
assert!(sinatra_keys.contains(&&"sinatra:2.0.0".to_string()));
let unknown_keys = index_file.find_keys_with_prefix("unknown:");
assert_eq!(unknown_keys.len(), 0);
}

#[tokio::test]
async fn test_index_file_empty() {
let temp_dir = TempDir::new().unwrap();
let file_path = Utf8PathBuf::from_path_buf(temp_dir.path().join("empty.idx")).unwrap();
let index_file = IndexFile::create(&file_path).await.unwrap();
assert!(index_file.is_empty());
assert_eq!(index_file.len(), 0);
assert!(index_file.key_range().is_none());
let no_offset = index_file.find_offset("anything").await.unwrap();
assert_eq!(no_offset, None);
}
}