Experimental BTree changes
This commit is contained in:
parent
a6b20df3f0
commit
4c78b7365a
@ -1,6 +1,7 @@
|
|||||||
#include "StarBTreeDatabase.hpp"
|
#include "StarBTreeDatabase.hpp"
|
||||||
#include "StarSha256.hpp"
|
#include "StarSha256.hpp"
|
||||||
#include "StarVlqEncoding.hpp"
|
#include "StarVlqEncoding.hpp"
|
||||||
|
#include "StarLogging.hpp"
|
||||||
|
|
||||||
namespace Star {
|
namespace Star {
|
||||||
|
|
||||||
@ -243,7 +244,7 @@ uint32_t BTreeDatabase::freeBlockCount() {
|
|||||||
indexBlockIndex = indexBlock.nextFreeBlock;
|
indexBlockIndex = indexBlock.nextFreeBlock;
|
||||||
}
|
}
|
||||||
|
|
||||||
count += m_availableBlocks.size() + m_pendingFree.size();
|
count += m_availableBlocks.size();
|
||||||
|
|
||||||
// Include untracked blocks at the end of the file in the free count.
|
// Include untracked blocks at the end of the file in the free count.
|
||||||
count += (m_device->size() - m_deviceSize) / m_blockSize;
|
count += (m_device->size() - m_deviceSize) / m_blockSize;
|
||||||
@ -272,7 +273,7 @@ uint32_t BTreeDatabase::leafBlockCount() {
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
BTreeDatabase* parent;
|
BTreeDatabase* parent = nullptr;
|
||||||
BlockIndex leafBlockCount = 0;
|
BlockIndex leafBlockCount = 0;
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -293,8 +294,8 @@ void BTreeDatabase::rollback() {
|
|||||||
|
|
||||||
m_availableBlocks.clear();
|
m_availableBlocks.clear();
|
||||||
m_indexCache.clear();
|
m_indexCache.clear();
|
||||||
|
m_uncommittedWrites.clear();
|
||||||
m_uncommitted.clear();
|
m_uncommitted.clear();
|
||||||
m_pendingFree.clear();
|
|
||||||
|
|
||||||
readRoot();
|
readRoot();
|
||||||
|
|
||||||
@ -305,6 +306,7 @@ void BTreeDatabase::rollback() {
|
|||||||
void BTreeDatabase::close(bool closeDevice) {
|
void BTreeDatabase::close(bool closeDevice) {
|
||||||
WriteLocker writeLocker(m_lock);
|
WriteLocker writeLocker(m_lock);
|
||||||
if (m_open) {
|
if (m_open) {
|
||||||
|
if (!tryFlatten())
|
||||||
doCommit();
|
doCommit();
|
||||||
|
|
||||||
m_indexCache.clear();
|
m_indexCache.clear();
|
||||||
@ -896,17 +898,25 @@ void BTreeDatabase::rawReadBlock(BlockIndex blockIndex, size_t blockOffset, char
|
|||||||
if (size <= 0)
|
if (size <= 0)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
|
if (auto buffer = m_uncommittedWrites.ptr(blockIndex))
|
||||||
|
buffer->copyTo(block, blockOffset, size);
|
||||||
|
else
|
||||||
m_device->readFullAbsolute(HeaderSize + blockIndex * (StreamOffset)m_blockSize + blockOffset, block, size);
|
m_device->readFullAbsolute(HeaderSize + blockIndex * (StreamOffset)m_blockSize + blockOffset, block, size);
|
||||||
}
|
}
|
||||||
|
|
||||||
void BTreeDatabase::rawWriteBlock(BlockIndex blockIndex, size_t blockOffset, char const* block, size_t size) const {
|
void BTreeDatabase::rawWriteBlock(BlockIndex blockIndex, size_t blockOffset, char const* block, size_t size) {
|
||||||
if (blockOffset > m_blockSize || size > m_blockSize - blockOffset)
|
if (blockOffset > m_blockSize || size > m_blockSize - blockOffset)
|
||||||
throw DBException::format("Write past end of block, offset: {} size {}", blockOffset, size);
|
throw DBException::format("Write past end of block, offset: {} size {}", blockOffset, size);
|
||||||
|
|
||||||
if (size <= 0)
|
if (size <= 0)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
m_device->writeFullAbsolute(HeaderSize + blockIndex * (StreamOffset)m_blockSize + blockOffset, block, size);
|
StreamOffset blockStart = HeaderSize + blockIndex * (StreamOffset)m_blockSize;
|
||||||
|
auto buffer = m_uncommittedWrites.find(blockIndex);
|
||||||
|
if (buffer == m_uncommittedWrites.end())
|
||||||
|
buffer = m_uncommittedWrites.emplace(blockIndex, m_device->readBytesAbsolute(blockStart, m_blockSize)).first;
|
||||||
|
|
||||||
|
buffer->second.writeFrom(block, blockOffset, size);
|
||||||
}
|
}
|
||||||
|
|
||||||
auto BTreeDatabase::readFreeIndexBlock(BlockIndex blockIndex) -> FreeIndexBlock {
|
auto BTreeDatabase::readFreeIndexBlock(BlockIndex blockIndex) -> FreeIndexBlock {
|
||||||
@ -991,12 +1001,12 @@ auto BTreeDatabase::leafTailBlocks(BlockIndex leafPointer) -> List<BlockIndex> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
void BTreeDatabase::freeBlock(BlockIndex b) {
|
void BTreeDatabase::freeBlock(BlockIndex b) {
|
||||||
if (m_uncommitted.contains(b)) {
|
if (m_uncommitted.contains(b))
|
||||||
m_uncommitted.remove(b);
|
m_uncommitted.remove(b);
|
||||||
|
if (m_uncommittedWrites.contains(b))
|
||||||
|
m_uncommittedWrites.remove(b);
|
||||||
|
|
||||||
m_availableBlocks.add(b);
|
m_availableBlocks.add(b);
|
||||||
} else {
|
|
||||||
m_pendingFree.append(b);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
auto BTreeDatabase::reserveBlock() -> BlockIndex {
|
auto BTreeDatabase::reserveBlock() -> BlockIndex {
|
||||||
@ -1007,10 +1017,7 @@ auto BTreeDatabase::reserveBlock() -> BlockIndex {
|
|||||||
FreeIndexBlock indexBlock = readFreeIndexBlock(m_headFreeIndexBlock);
|
FreeIndexBlock indexBlock = readFreeIndexBlock(m_headFreeIndexBlock);
|
||||||
for (auto const& b : indexBlock.freeBlocks)
|
for (auto const& b : indexBlock.freeBlocks)
|
||||||
m_availableBlocks.add(b);
|
m_availableBlocks.add(b);
|
||||||
// We cannot make available the block itself, because we must maintain
|
m_availableBlocks.add(m_headFreeIndexBlock);
|
||||||
// atomic consistency. We will need to free this block later and commit
|
|
||||||
// the new free index block chain.
|
|
||||||
m_pendingFree.append(m_headFreeIndexBlock);
|
|
||||||
m_headFreeIndexBlock = indexBlock.nextFreeBlock;
|
m_headFreeIndexBlock = indexBlock.nextFreeBlock;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1068,65 +1075,168 @@ void BTreeDatabase::readRoot() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
void BTreeDatabase::doCommit() {
|
void BTreeDatabase::doCommit() {
|
||||||
if (m_availableBlocks.empty() && m_pendingFree.empty() && m_uncommitted.empty())
|
if (m_availableBlocks.empty() && m_uncommitted.empty())
|
||||||
return;
|
return;
|
||||||
|
|
||||||
if (!m_availableBlocks.empty() || !m_pendingFree.empty()) {
|
if (!m_availableBlocks.empty()) {
|
||||||
// First, read the existing head FreeIndexBlock, if it exists
|
// First, read the existing head FreeIndexBlock, if it exists
|
||||||
FreeIndexBlock indexBlock = FreeIndexBlock{InvalidBlockIndex, {}};
|
FreeIndexBlock indexBlock = FreeIndexBlock{InvalidBlockIndex, {}};
|
||||||
if (m_headFreeIndexBlock != InvalidBlockIndex) {
|
|
||||||
indexBlock = readFreeIndexBlock(m_headFreeIndexBlock);
|
|
||||||
if (indexBlock.freeBlocks.size() >= maxFreeIndexLength()) {
|
|
||||||
// If the existing head free index block is full, then we should start a
|
|
||||||
// new one and leave it alone
|
|
||||||
indexBlock.nextFreeBlock = m_headFreeIndexBlock;
|
|
||||||
indexBlock.freeBlocks.clear();
|
|
||||||
} else {
|
|
||||||
// If we are copying an existing free index block, the old free index
|
|
||||||
// block will be a newly freed block
|
|
||||||
indexBlock.freeBlocks.append(m_headFreeIndexBlock);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Then, we need to write all the available blocks, which are safe to write
|
auto newBlock = [&]() -> BlockIndex {
|
||||||
// to, and the pending free blocks, which are NOT safe to write to, to the
|
if (!m_availableBlocks.empty())
|
||||||
// FreeIndexBlock chain.
|
return m_availableBlocks.takeFirst();
|
||||||
while (true) {
|
|
||||||
if (indexBlock.freeBlocks.size() < maxFreeIndexLength() && (!m_availableBlocks.empty() || !m_pendingFree.empty())) {
|
|
||||||
// If we have room on our current FreeIndexblock, just add a block to
|
|
||||||
// it. Prioritize the pending free blocks, because we cannot use those
|
|
||||||
// to write to.
|
|
||||||
BlockIndex toAdd;
|
|
||||||
if (m_pendingFree.empty())
|
|
||||||
toAdd = m_availableBlocks.takeFirst();
|
|
||||||
else
|
else
|
||||||
toAdd = m_pendingFree.takeFirst();
|
return makeEndBlock();
|
||||||
|
};
|
||||||
|
|
||||||
|
if (m_headFreeIndexBlock != InvalidBlockIndex)
|
||||||
|
indexBlock = readFreeIndexBlock(m_headFreeIndexBlock);
|
||||||
|
else
|
||||||
|
m_headFreeIndexBlock = newBlock();
|
||||||
|
|
||||||
|
// Then, we need to write all the available blocks to the FreeIndexBlock chain.
|
||||||
|
while (true) {
|
||||||
|
// If we have room on our current FreeIndexBlock, just add a block to it.
|
||||||
|
if (!m_availableBlocks.empty() && indexBlock.freeBlocks.size() < maxFreeIndexLength()) {
|
||||||
|
BlockIndex toAdd = m_availableBlocks.takeFirst();
|
||||||
indexBlock.freeBlocks.append(toAdd);
|
indexBlock.freeBlocks.append(toAdd);
|
||||||
} else {
|
} else {
|
||||||
// If our index block is full OR we are out of blocks to free, then
|
// Update the current head free index block.
|
||||||
// need to write a new head free index block.
|
|
||||||
if (m_availableBlocks.empty())
|
|
||||||
m_headFreeIndexBlock = makeEndBlock();
|
|
||||||
else
|
|
||||||
m_headFreeIndexBlock = m_availableBlocks.takeFirst();
|
|
||||||
writeFreeIndexBlock(m_headFreeIndexBlock, indexBlock);
|
writeFreeIndexBlock(m_headFreeIndexBlock, indexBlock);
|
||||||
|
|
||||||
// If we're out of blocks to free, then we're done
|
// If we're out of blocks to free, then we're done
|
||||||
if (m_availableBlocks.empty() && m_pendingFree.empty())
|
if (m_availableBlocks.empty())
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
// If our head free index block is full, then
|
||||||
|
// need to write a new head free index block.
|
||||||
|
if (indexBlock.freeBlocks.size() >= maxFreeIndexLength()) {
|
||||||
indexBlock.nextFreeBlock = m_headFreeIndexBlock;
|
indexBlock.nextFreeBlock = m_headFreeIndexBlock;
|
||||||
indexBlock.freeBlocks.clear();
|
indexBlock.freeBlocks.clear();
|
||||||
|
|
||||||
|
m_headFreeIndexBlock = newBlock();
|
||||||
|
writeFreeIndexBlock(m_headFreeIndexBlock, indexBlock);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
commitWrites();
|
||||||
writeRoot();
|
writeRoot();
|
||||||
|
|
||||||
m_uncommitted.clear();
|
m_uncommitted.clear();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void BTreeDatabase::commitWrites() {
|
||||||
|
for (auto& write : m_uncommittedWrites)
|
||||||
|
m_device->writeFullAbsolute(HeaderSize + write.first * (StreamOffset)m_blockSize, write.second.ptr(), m_blockSize);
|
||||||
|
|
||||||
|
m_device->sync();
|
||||||
|
m_uncommittedWrites.clear();
|
||||||
|
}
|
||||||
|
|
||||||
|
bool BTreeDatabase::tryFlatten() {
|
||||||
|
if (m_headFreeIndexBlock == InvalidBlockIndex || m_rootIsLeaf || !m_device->isWritable())
|
||||||
|
return false;
|
||||||
|
|
||||||
|
BlockIndex freeBlockCount = 0;
|
||||||
|
BlockIndex indexBlockIndex = m_headFreeIndexBlock;
|
||||||
|
while (indexBlockIndex != InvalidBlockIndex) {
|
||||||
|
FreeIndexBlock indexBlock = readFreeIndexBlock(indexBlockIndex);
|
||||||
|
freeBlockCount += 1 + indexBlock.freeBlocks.size();
|
||||||
|
indexBlockIndex = indexBlock.nextFreeBlock;
|
||||||
|
}
|
||||||
|
|
||||||
|
BlockIndex expectedBlockCount = (m_deviceSize - HeaderSize) / m_blockSize;
|
||||||
|
float free = float(freeBlockCount) / float(expectedBlockCount);
|
||||||
|
if (free < 0.05f)
|
||||||
|
return false;
|
||||||
|
|
||||||
|
Logger::info("[BTreeDatabase] File '{}' is {:.2f}% free space, flattening", m_device->deviceName(), free * 100.f);
|
||||||
|
|
||||||
|
indexBlockIndex = m_headFreeIndexBlock;
|
||||||
|
{
|
||||||
|
List<BlockIndex> availableBlocksList;
|
||||||
|
do {
|
||||||
|
FreeIndexBlock indexBlock = readFreeIndexBlock(indexBlockIndex);
|
||||||
|
availableBlocksList.appendAll(indexBlock.freeBlocks);
|
||||||
|
availableBlocksList.append(indexBlockIndex);
|
||||||
|
indexBlockIndex = indexBlock.nextFreeBlock;
|
||||||
|
} while (indexBlockIndex != InvalidBlockIndex);
|
||||||
|
m_headFreeIndexBlock = InvalidBlockIndex;
|
||||||
|
|
||||||
|
sort(availableBlocksList);
|
||||||
|
for (auto& availableBlock : availableBlocksList)
|
||||||
|
m_availableBlocks.insert(m_availableBlocks.end(), availableBlock);
|
||||||
|
}
|
||||||
|
|
||||||
|
BlockIndex count = 1; // 1 to include root index
|
||||||
|
|
||||||
|
double start = Time::monotonicTime();
|
||||||
|
auto index = m_impl.loadIndex(m_impl.rootPointer());
|
||||||
|
if (flattenVisitor(index, count)) {
|
||||||
|
m_impl.deleteIndex(index);
|
||||||
|
index->self = InvalidBlockIndex;
|
||||||
|
m_root = m_impl.storeIndex(index);
|
||||||
|
}
|
||||||
|
|
||||||
|
m_availableBlocks.clear();
|
||||||
|
m_device->resize(m_deviceSize = HeaderSize + (StreamOffset)m_blockSize * count);
|
||||||
|
|
||||||
|
m_indexCache.clear();
|
||||||
|
commitWrites();
|
||||||
|
writeRoot();
|
||||||
|
m_uncommitted.clear();
|
||||||
|
|
||||||
|
Logger::info("[BTreeDatabase] Finished flattening '{}' in {:.2f} milliseconds", m_device->deviceName(), (Time::monotonicTime() - start) * 1000.f);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool BTreeDatabase::flattenVisitor(BTreeImpl::Index& index, BlockIndex& count) {
|
||||||
|
auto pointerCount = index->pointerCount();
|
||||||
|
count += pointerCount;
|
||||||
|
bool canStore = !m_availableBlocks.empty();
|
||||||
|
|
||||||
|
bool needsStore = false;
|
||||||
|
if (m_impl.indexLevel(index) == 0) {
|
||||||
|
for (size_t i = 0; i != pointerCount; ++i) {
|
||||||
|
auto indexPointer = index->pointer(i);
|
||||||
|
auto tailBlocks = leafTailBlocks(indexPointer);
|
||||||
|
if (canStore) {
|
||||||
|
bool leafNeedsStore = m_availableBlocks.first() < indexPointer;
|
||||||
|
|
||||||
|
if (!leafNeedsStore)
|
||||||
|
for (size_t i = 0; !leafNeedsStore && i != tailBlocks.size(); ++i)
|
||||||
|
if (m_availableBlocks.first() < tailBlocks[i])
|
||||||
|
leafNeedsStore = true;
|
||||||
|
|
||||||
|
if (leafNeedsStore) {
|
||||||
|
auto leaf = m_impl.loadLeaf(indexPointer);
|
||||||
|
m_impl.deleteLeaf(leaf);
|
||||||
|
leaf->self = InvalidBlockIndex;
|
||||||
|
index->updatePointer(i, m_impl.storeLeaf(leaf));
|
||||||
|
tailBlocks = leafTailBlocks(leaf->self);
|
||||||
|
needsStore = true;
|
||||||
|
}
|
||||||
|
canStore = !m_availableBlocks.empty();
|
||||||
|
}
|
||||||
|
count += tailBlocks.size();
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
for (size_t i = 0; i != pointerCount; ++i) {
|
||||||
|
auto childIndex = m_impl.loadIndex(index->pointer(i));
|
||||||
|
if (canStore && flattenVisitor(childIndex, count)) {
|
||||||
|
m_impl.deleteIndex(childIndex);
|
||||||
|
childIndex->self = InvalidBlockIndex;
|
||||||
|
index->updatePointer(i, m_impl.storeIndex(childIndex));
|
||||||
|
canStore = !m_availableBlocks.empty();
|
||||||
|
needsStore = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return needsStore || (canStore && m_availableBlocks.first() < index->self);
|
||||||
|
}
|
||||||
|
|
||||||
void BTreeDatabase::checkIfOpen(char const* methodName, bool shouldBeOpen) const {
|
void BTreeDatabase::checkIfOpen(char const* methodName, bool shouldBeOpen) const {
|
||||||
if (shouldBeOpen && !m_open)
|
if (shouldBeOpen && !m_open)
|
||||||
throw DBException::format("BTreeDatabase method '{}' called when not open, must be open.", methodName);
|
throw DBException::format("BTreeDatabase method '{}' called when not open, must be open.", methodName);
|
||||||
@ -1146,7 +1256,7 @@ void BTreeDatabase::checkKeySize(ByteArray const& k) const {
|
|||||||
}
|
}
|
||||||
|
|
||||||
uint32_t BTreeDatabase::maxFreeIndexLength() const {
|
uint32_t BTreeDatabase::maxFreeIndexLength() const {
|
||||||
return (m_blockSize - 2 - sizeof(BlockIndex) - 4) / sizeof(BlockIndex);
|
return (m_blockSize / sizeof(BlockIndex)) - 2 - sizeof(BlockIndex) - 4;
|
||||||
}
|
}
|
||||||
|
|
||||||
BTreeSha256Database::BTreeSha256Database() {
|
BTreeSha256Database::BTreeSha256Database() {
|
||||||
|
@ -230,7 +230,7 @@ private:
|
|||||||
void updateBlock(BlockIndex blockIndex, ByteArray const& block);
|
void updateBlock(BlockIndex blockIndex, ByteArray const& block);
|
||||||
|
|
||||||
void rawReadBlock(BlockIndex blockIndex, size_t blockOffset, char* block, size_t size) const;
|
void rawReadBlock(BlockIndex blockIndex, size_t blockOffset, char* block, size_t size) const;
|
||||||
void rawWriteBlock(BlockIndex blockIndex, size_t blockOffset, char const* block, size_t size) const;
|
void rawWriteBlock(BlockIndex blockIndex, size_t blockOffset, char const* block, size_t size);
|
||||||
|
|
||||||
void updateHeadFreeIndexBlock(BlockIndex newHead);
|
void updateHeadFreeIndexBlock(BlockIndex newHead);
|
||||||
|
|
||||||
@ -251,6 +251,9 @@ private:
|
|||||||
void writeRoot();
|
void writeRoot();
|
||||||
void readRoot();
|
void readRoot();
|
||||||
void doCommit();
|
void doCommit();
|
||||||
|
void commitWrites();
|
||||||
|
bool tryFlatten();
|
||||||
|
bool flattenVisitor(BTreeImpl::Index& index, BlockIndex& count);
|
||||||
|
|
||||||
void checkIfOpen(char const* methodName, bool shouldBeOpen) const;
|
void checkIfOpen(char const* methodName, bool shouldBeOpen) const;
|
||||||
void checkBlockIndex(size_t blockIndex) const;
|
void checkBlockIndex(size_t blockIndex) const;
|
||||||
@ -285,14 +288,14 @@ private:
|
|||||||
bool m_dirty;
|
bool m_dirty;
|
||||||
|
|
||||||
// Blocks that can be freely allocated and written to without violating
|
// Blocks that can be freely allocated and written to without violating
|
||||||
// atomic consistency
|
// atomic consistency.
|
||||||
Set<BlockIndex> m_availableBlocks;
|
Set<BlockIndex> m_availableBlocks;
|
||||||
|
|
||||||
// Blocks to be freed on next commit.
|
|
||||||
Deque<BlockIndex> m_pendingFree;
|
|
||||||
|
|
||||||
// Blocks that have been written in uncommitted portions of the tree.
|
// Blocks that have been written in uncommitted portions of the tree.
|
||||||
Set<BlockIndex> m_uncommitted;
|
Set<BlockIndex> m_uncommitted;
|
||||||
|
|
||||||
|
// Temporarily holds written data so that it can be rolled back.
|
||||||
|
mutable Map<BlockIndex, ByteArray> m_uncommittedWrites;
|
||||||
};
|
};
|
||||||
|
|
||||||
// Version of BTreeDatabase that hashes keys with SHA-256 to produce a unique
|
// Version of BTreeDatabase that hashes keys with SHA-256 to produce a unique
|
||||||
|
Loading…
x
Reference in New Issue
Block a user