Compact serialization for variable-length integers

Variable-length integers: bytes are a MSB base-128 encoding of the number.
The high bit in each byte signifies whether another digit follows. To make
the encoding is one-to-one, one is subtracted from all but the last digit.
Thus, the byte sequence a[] with length len, where all but the last byte
has bit 128 set, encodes the number:

  (a[len-1] & 0x7F) + sum(i=1..len-1, 128^i*((a[len-i-1] & 0x7F)+1))

Properties:
* Very small (0-127: 1 byte, 128-16511: 2 bytes, 16512-2113663: 3 bytes)
* Every integer has exactly one encoding
* Encoding does not depend on size of original integer type
This commit is contained in:
Pieter Wuille
2012-06-15 14:19:11 +02:00
parent 43b7905e98
commit 4d6144f97f
2 changed files with 138 additions and 1 deletions

View File

@@ -238,9 +238,75 @@ uint64 ReadCompactSize(Stream& is)
return nSizeRet;
}
// Variable-length integers: bytes are a MSB base-128 encoding of the number.
// The high bit in each byte signifies whether another digit follows. To make
// the encoding is one-to-one, one is subtracted from all but the last digit.
// Thus, the byte sequence a[] with length len, where all but the last byte
// has bit 128 set, encodes the number:
//
// (a[len-1] & 0x7F) + sum(i=1..len-1, 128^i*((a[len-i-1] & 0x7F)+1))
//
// Properties:
// * Very small (0-127: 1 byte, 128-16511: 2 bytes, 16512-2113663: 3 bytes)
// * Every integer has exactly one encoding
// * Encoding does not depend on size of original integer type
// * No redundancy: every (infinite) byte sequence corresponds to a list
// of encoded integers.
//
// 0: [0x00] 256: [0x81 0x00]
// 1: [0x01] 16383: [0xFE 0x7F]
// 127: [0x7F] 16384: [0xFF 0x00]
// 128: [0x80 0x00] 16511: [0x80 0xFF 0x7F]
// 255: [0x80 0x7F] 65535: [0x82 0xFD 0x7F]
// 2^32: [0x8E 0xFE 0xFE 0xFF 0x00]
template<typename I>
inline unsigned int GetSizeOfVarInt(I n)
{
int nRet = 0;
while(true) {
nRet++;
if (n <= 0x7F)
break;
n = (n >> 7) - 1;
}
return nRet;
}
#define FLATDATA(obj) REF(CFlatData((char*)&(obj), (char*)&(obj) + sizeof(obj)))
template<typename Stream, typename I>
void WriteVarInt(Stream& os, I n)
{
unsigned char tmp[(sizeof(n)*8+6)/7];
int len=0;
while(true) {
tmp[len] = (n & 0x7F) | (len ? 0x80 : 0x00);
if (n <= 0x7F)
break;
n = (n >> 7) - 1;
len++;
}
do {
WRITEDATA(os, tmp[len]);
} while(len--);
}
template<typename Stream, typename I>
I ReadVarInt(Stream& is)
{
I n = 0;
while(true) {
unsigned char chData;
READDATA(is, chData);
n = (n << 7) | (chData & 0x7F);
if (chData & 0x80)
n++;
else
return n;
}
}
#define FLATDATA(obj) REF(CFlatData((char*)&(obj), (char*)&(obj) + sizeof(obj)))
#define VARINT(obj) REF(WrapVarInt(REF(obj)))
/** Wrapper for serializing arrays and POD.
*/
@@ -274,6 +340,32 @@ public:
}
};
template<typename I>
class CVarInt
{
protected:
I &n;
public:
CVarInt(I& nIn) : n(nIn) { }
unsigned int GetSerializeSize(int, int) const {
return GetSizeOfVarInt<I>(n);
}
template<typename Stream>
void Serialize(Stream &s, int, int) const {
WriteVarInt<Stream,I>(s, n);
}
template<typename Stream>
void Unserialize(Stream& s, int, int) {
n = ReadVarInt<Stream,I>(s);
}
};
template<typename I>
CVarInt<I> WrapVarInt(I& n) { return CVarInt<I>(n); }
//
// Forward declarations
//