Skip to content

Commit 871c8e6

Browse files
author
lsleonard
committed
Improve bit handling input for encoding
1. In tdString.c, moved the inline functions for bit output to td64_internal.h where they can also be used by functions in td64.c. 2. In td64.c, implemented bit output improvements for encodeAdaptiveTextMode and encodeStringMode.
1 parent 265f388 commit 871c8e6

7 files changed

+85
-87
lines changed

Tiny Data Compression with td512.docx

397 Bytes
Binary file not shown.

main.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,7 @@ int32_t test_td512_1to512(void)
7878
int main(int argc, char* argv[])
7979
{
8080
FILE *ifile, *ofile;
81-
char ofileName[256];
81+
char ofileName[FILENAME_MAX];
8282
unsigned char *src, *dst;
8383
size_t len, len2, len3;
8484
clock_t begin, end;
@@ -127,7 +127,7 @@ int main(int argc, char* argv[])
127127
src = (unsigned char*) malloc(len);
128128
fread(src, 1, len, ifile);
129129
fclose(ifile);
130-
130+
131131
// allocate "uncompressed size" + 3 bytes per block for the destination buffer
132132
dst = (unsigned char*) malloc(len + 4 * (len / blockSize + 1));
133133
if (argc >= 3)

td512.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,11 @@
6161
2. In td64.c, decodeAdaptiveTextMode, made dtbmThisVal internal to function rather than static global. Added a read-ahead byte so that peeking at bits does not require an extra read. The read-ahead means the main loop must stop three values early to avoid reading beyond the end of the input values, and those three values must be processed without read-ahead.
6262
3. In td64.c, decodeStringMode, made dsmThisVal internal to function rather than static global.
6363
*/
64+
// Notes for version 2.1.8:
65+
/*
66+
1. In tdString.c, moved the inline functions for bit output to td64_internal.h where they can also be used by functions in td64.c.
67+
2. In td64.c, implemented bit output improvements for encodeAdaptiveTextMode and encodeStringMode.
68+
*/
6469
#ifndef td512_h
6570
#define td512_h
6671

td64.c

Lines changed: 25 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -583,18 +583,6 @@ int32_t td5d(const unsigned char *inVals, unsigned char *outVals, const uint32_t
583583
}
584584
} // end td5d
585585

586-
static inline void esmOutputBits(unsigned char *outVals, const uint32_t nBits, const uint32_t bitVal, uint32_t *nextOutIx, uint32_t *nextOutBit)
587-
{
588-
// output 1 to 8 bits
589-
outVals[*nextOutIx] |= (unsigned char)(bitVal << *nextOutBit);
590-
*nextOutBit += nBits;
591-
if (*nextOutBit >= 8)
592-
{
593-
*nextOutBit -= 8;
594-
outVals[++(*nextOutIx)] = (unsigned char)bitVal >> (nBits - *nextOutBit);
595-
}
596-
} // end esmOutputBits
597-
598586
static uint32_t textNBitsTable[MAX_PREDEFINED_FREQUENCY_CHAR_COUNT]={
599587
3, 3, 4, 4,
600588
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
@@ -704,6 +692,7 @@ int32_t encodeAdaptiveTextMode(const unsigned char *inVals, unsigned char *outVa
704692
uint32_t inVal;
705693
uint32_t nextOutIx=1;
706694
uint32_t nextOutBit=0;
695+
uint64_t outBits=0; // store 64 bits before writing
707696
uint32_t eVal;
708697
const uint32_t *textEncodingArray=extendedTextEncoding;
709698
const uint32_t output7or8=highBitclear ? 7 : 8;
@@ -714,22 +703,21 @@ int32_t encodeAdaptiveTextMode(const unsigned char *inVals, unsigned char *outVa
714703
setAdaptiveChars(val256, outVals, nValues, &textEncodingArray);
715704
if (highBitclear)
716705
outVals[0] |= 128; // set high bit of info byte to indicate 7-bit values
717-
outVals[1] = 0; // init first value used by esmOutputBits
718706
while (pInVal < pLastInValPlusOne)
719707
{
720708
eVal=textEncodingArray[(inVal=(unsigned char)*(pInVal++))];
721709
if (eVal < MAX_PREDEFINED_FREQUENCY_CHAR_COUNT)
722710
{
723711
// encode predefined chars and adaptive chars
724-
esmOutputBits(outVals, textNBitsTable[eVal], textBitValTable[eVal], &nextOutIx, &nextOutBit);
712+
thisOutIx2(outVals, textNBitsTable[eVal], textBitValTable[eVal], &nextOutIx, &nextOutBit, &outBits);
725713
}
726714
else
727715
{
728716
// output char not predefined or adaptive
729717
if (nextOutIx > maxBytes)
730718
return 0; // requested compression not met
731-
esmOutputBits(outVals, 3, 0x5, &nextOutIx, &nextOutBit);
732-
esmOutputBits(outVals, output7or8, inVal, &nextOutIx, &nextOutBit); // output 7 bits if high bit clear, else 8
719+
thisOutIx2(outVals, 3, 0x5, &nextOutIx, &nextOutBit, &outBits);
720+
thisOutIx2(outVals, output7or8, inVal, &nextOutIx, &nextOutBit, &outBits); // output 7 bits if high bit clear, else 8
733721
#ifdef TD64_TEST_MODE
734722
if (textEncodingArray == extendedTextEncoding)
735723
g_td64Text8bitCount++;
@@ -738,7 +726,8 @@ int32_t encodeAdaptiveTextMode(const unsigned char *inVals, unsigned char *outVa
738726
#endif
739727
}
740728
}
741-
return nextOutIx * 8 + nextOutBit;
729+
esmOutputRemainder(outVals, &nextOutIx, &nextOutBit, &outBits);
730+
return nextOutIx * 8;
742731
} // end encodeAdaptiveTextMode
743732

744733
int32_t encodeSingleValueMode(const unsigned char *inVals, unsigned char *outVals, const uint32_t nValues, int32_t singleValue, const uint32_t compressNSV)
@@ -940,6 +929,7 @@ int32_t encodeStringMode(const unsigned char *inVals, unsigned char *outVals, co
940929
//uint32_t nextOutIx=nUniquesIn + 1; // start of encoding past uniques written from outer loop;
941930
uint32_t nextOutIx; // round for now
942931
uint32_t nextOutBit=1; // first bit indicates 1 or 2 uniques in first two input values
932+
uint64_t outBits; // store 64 bits before writing
943933
const uint32_t maxBytes=maxBits/8; // compare against bytes out during loop
944934

945935
if (nUniquesIn > 32 || nUniquesIn < MIN_STRING_MODE_UNIQUES)
@@ -956,7 +946,6 @@ int32_t encodeStringMode(const unsigned char *inVals, unsigned char *outVals, co
956946
nextOutIx=nUniquesIn+1;
957947
outVals[0] = 1 | (unsigned char)((nUniquesIn-17)<<4); // indicate string mode in first 3 bits, 0 for uniques uncompressed, then number uniques - 17 (excess 16 as always 17+ values) in next 4 bits
958948
}
959-
outVals[nextOutIx] = 0; // init for esmOutputBits
960949
// output two initial values
961950
// first unique assumed
962951
const unsigned char inVal0=inVals[0];
@@ -965,15 +954,15 @@ int32_t encodeStringMode(const unsigned char *inVals, unsigned char *outVals, co
965954
// first two values are the same
966955
nUniques = 1;
967956
// output 1 to indicate first unique value repeated
968-
outVals[nextOutIx] = 1; // 1=repeat for second value
957+
outBits = 1; // 1=repeat for second value
969958
twoValsPos[inVal0] = 1;
970959
}
971960
else
972961
{
973962
// second val is a new unique
974963
nUniques = 2;
975964
// set up position of 2nd unique
976-
outVals[nextOutIx] = 0; // 0=uniques in first two values
965+
outBits = 0; // 0=uniques in first two values
977966
twoValsPos[inVal0] = 1;
978967
twoValsPos[inVals[1]] = 2;
979968
}
@@ -991,10 +980,11 @@ int32_t encodeStringMode(const unsigned char *inVals, unsigned char *outVals, co
991980
{
992981
// first occurrence of this unique
993982
// output a 0 to indicate new unique
994-
if (++nextOutBit == 8)
983+
if (++nextOutBit == 64)
995984
{
996-
// update out index and next out bit
997-
outVals[++nextOutIx] = 0;
985+
// output outBits and init for next output
986+
esmOutputOutBits(outVals, &nextOutIx, &outBits);
987+
outBits = 0;
998988
nextOutBit = 0;
999989
}
1000990
twoValsPos[inVal] = inPos;
@@ -1008,7 +998,7 @@ int32_t encodeStringMode(const unsigned char *inVals, unsigned char *outVals, co
1008998
{
1009999
// pos of unique plus one and next input value match
10101000
// output repeated value: 01 plus unique
1011-
esmOutputBits(outVals, 2+encodingBits[nUniques-1], 1|(uoInVal<<2), &nextOutIx, &nextOutBit);
1001+
thisOutIx2(outVals, 2+encodingBits[nUniques-1], 1|(uoInVal<<2), &nextOutIx, &nextOutBit, &outBits);
10121002
continue;
10131003
}
10141004
// look for continuation of matching characters
@@ -1025,22 +1015,21 @@ int32_t encodeStringMode(const unsigned char *inVals, unsigned char *outVals, co
10251015
tvPos++;
10261016
}
10271017
// output 11 plus 3 more bits for string length 2 to 9
1028-
esmOutputBits(outVals, 5, 3 | ((strCount-2)<<2), &nextOutIx, &nextOutBit);
1018+
thisOutIx2(outVals, 5, 3 | ((strCount-2)<<2), &nextOutIx, &nextOutBit, &outBits);
10291019
// output the unique that started this string, which gives its position
1030-
esmOutputBits(outVals, encodingBits[nUniques-1], uoInVal, &nextOutIx, &nextOutBit);
1020+
thisOutIx2(outVals, encodingBits[nUniques-1], uoInVal, &nextOutIx, &nextOutBit, &outBits);
10311021
inPos += strCount - 1;
10321022
nextInVal = inVals[inPos]; // new next val after string
10331023
}
10341024
else
10351025
{
10361026
// this pair doesn't match the one for first occurrence of this unique
10371027
// repeated value: 01
1038-
esmOutputBits(outVals, 2+encodingBits[nUniques-1], 1|(uoInVal<<2), &nextOutIx, &nextOutBit);
1028+
thisOutIx2(outVals, 2+encodingBits[nUniques-1], 1|(uoInVal<<2), &nextOutIx, &nextOutBit, &outBits);
10391029
}
10401030
}
1031+
esmOutputRemainder(outVals, &nextOutIx, &nextOutBit, &outBits);
10411032
// output final bits
1042-
if (nextOutBit > 0)
1043-
nextOutIx++; // index past final bits
10441033
if (inPos < nValues)
10451034
{
10461035
outVals[nextOutIx++] = inVals[lastPos]; // output last input byte
@@ -1524,19 +1513,19 @@ int32_t decodeAdaptiveTextMode(const unsigned char *inVals, unsigned char *outVa
15241513
{
15251514
// peek at the next 7 bits to decide what to do
15261515
dtbmPeekBits(7, bitPos, &theBits, &dtbmThisInVal);
1527-
if ((theBits & 7) == 5)
1516+
if ((theBits & 7) != 5)
1517+
{
1518+
// output the corresponding text char and skip corresponding number of bits
1519+
outVals[nextOutVal++] = (unsigned char)pTextChars[textDecodePos[theBits]];
1520+
dtbmSkipBits(inVals, textDecodeBits[theBits], &thisInValIx, &bitPos, &dtbmThisInVal);
1521+
}
1522+
else
15281523
{
15291524
// skip three bits, get 7 or 8 more bits and output original value
15301525
dtbmSkipBits(inVals, 3, &thisInValIx, &bitPos, &dtbmThisInVal);
15311526
dtbmGetBits(inVals, input7or8, &thisInValIx, &bitPos, &theBits, &dtbmThisInVal);
15321527
outVals[nextOutVal++] = (unsigned char)theBits;
15331528
}
1534-
else
1535-
{
1536-
// output the corresponding text char and skip corresponding number of bits
1537-
outVals[nextOutVal++] = (unsigned char)pTextChars[textDecodePos[theBits]];
1538-
dtbmSkipBits(inVals, textDecodeBits[theBits], &thisInValIx, &bitPos, &dtbmThisInVal);
1539-
}
15401529
}
15411530
// Process the last three values: requires at least 2 bytes (3*3) and up to 4 bytes (3*7)
15421531
uint32_t lastBits=dtbmThisInVal>>bitPos; // next value already read

td64.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@
2727
#define NDEBUG // disable asserts
2828
#include <assert.h>
2929

30-
#define TD64_VERSION "v2.1.7"
30+
#define TD64_VERSION "v2.1.8"
3131
#define MAX_TD64_BYTES 64 // max input vals supported
3232
#define MIN_TD64_BYTES 1 // min input vals supported
3333
#define MAX_UNIQUES 16 // max uniques supported in input

td64_internal.h

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,50 @@
2828
static const uint32_t encodingBits[64]={1,1,2,2,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6};
2929
static const uint32_t bitMask[]={0,1,3,7,15,31,63,127,255,511};
3030

31+
static inline void esmOutputRemainder(unsigned char *outValsT, uint32_t *thisOutIx, uint32_t *nextOutBit, uint64_t *outBits)
32+
{
33+
if (*nextOutBit == 0)
34+
return; // no bits to output
35+
uint32_t shiftPos=0;
36+
int32_t bitsRemaining=*nextOutBit-8;
37+
// output bits that remain
38+
outValsT[(*thisOutIx)++] = (unsigned char)*outBits;
39+
while (bitsRemaining > 0)
40+
{
41+
shiftPos += 8;
42+
outValsT[(*thisOutIx)++] = (unsigned char)(*outBits >> shiftPos);
43+
bitsRemaining -= 8;
44+
}
45+
*nextOutBit = 0;
46+
} // end esmOutputRemainder
47+
48+
static inline void esmOutputOutBits(unsigned char *outValsT, uint32_t *thisOutIx, uint64_t *outBits)
49+
{
50+
// copy 64 bits to output
51+
outValsT[(*thisOutIx)++] = (unsigned char)*outBits;
52+
outValsT[(*thisOutIx)++] = (unsigned char)(*outBits>>8);
53+
outValsT[(*thisOutIx)++] = (unsigned char)(*outBits>>16);
54+
outValsT[(*thisOutIx)++] = (unsigned char)(*outBits>>24);
55+
outValsT[(*thisOutIx)++] = (unsigned char)(*outBits>>32);
56+
outValsT[(*thisOutIx)++] = (unsigned char)(*outBits>>40);
57+
outValsT[(*thisOutIx)++] = (unsigned char)(*outBits>>48);
58+
outValsT[(*thisOutIx)++] = (unsigned char)(*outBits>>56);
59+
} // end esmOutputOutBits
60+
61+
static inline void thisOutIx2(unsigned char *outValsT, const uint32_t nBits, const uint64_t bitVal, uint32_t *thisOutIx, uint32_t *nextOutBit, uint64_t *outBits)
62+
{
63+
// output 1 to 64 bits
64+
*outBits |= bitVal << *nextOutBit;
65+
*nextOutBit += nBits;
66+
if (*nextOutBit >= 64)
67+
{
68+
esmOutputOutBits(outValsT, thisOutIx, outBits);
69+
// init outBits with remainder of bits from current output
70+
*nextOutBit -= 64;
71+
*outBits = bitVal >> (nBits - *nextOutBit);
72+
}
73+
} // end thisOutIx2
74+
3175
static int32_t encode7bitsInternal(const unsigned char *inVals, unsigned char *outVals, const uint32_t nValues)
3276
{
3377
// for internal use: output 7 bytes for each 8-byte group, then remaining bytes

tdString.c

Lines changed: 8 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -26,50 +26,6 @@
2626

2727
#define MAX_STRING_MODE_EXTENDED_VALUES 512
2828

29-
static inline void esmOutputRemainder(unsigned char *outValsT, uint32_t *thisOutIx, uint32_t *nextOutBit, uint64_t *outBits)
30-
{
31-
if (*nextOutBit == 0)
32-
return; // no bits to output
33-
uint32_t shiftPos=0;
34-
int32_t bitsRemaining=*nextOutBit-8;
35-
// output bits that remain
36-
outValsT[(*thisOutIx)++] = (unsigned char)*outBits;
37-
while (bitsRemaining > 0)
38-
{
39-
shiftPos += 8;
40-
outValsT[(*thisOutIx)++] = (unsigned char)(*outBits >> shiftPos);
41-
bitsRemaining -= 8;
42-
}
43-
*nextOutBit = 0;
44-
} // end esmOutputRemainder
45-
46-
static inline void esmOutputOutBits(unsigned char *outValsT, uint32_t *thisOutIx, uint64_t *outBits)
47-
{
48-
// copy 64 bits to output
49-
outValsT[(*thisOutIx)++] = (unsigned char)*outBits;
50-
outValsT[(*thisOutIx)++] = (unsigned char)(*outBits>>8);
51-
outValsT[(*thisOutIx)++] = (unsigned char)(*outBits>>16);
52-
outValsT[(*thisOutIx)++] = (unsigned char)(*outBits>>24);
53-
outValsT[(*thisOutIx)++] = (unsigned char)(*outBits>>32);
54-
outValsT[(*thisOutIx)++] = (unsigned char)(*outBits>>40);
55-
outValsT[(*thisOutIx)++] = (unsigned char)(*outBits>>48);
56-
outValsT[(*thisOutIx)++] = (unsigned char)(*outBits>>56);
57-
} // end esmOutputOutBits
58-
59-
static inline void thisOutIx2(unsigned char *outValsT, const uint32_t nBits, const uint64_t bitVal, uint32_t *thisOutIx, uint32_t *nextOutBit, uint64_t *outBits)
60-
{
61-
// output 1 to 64 bits
62-
*outBits |= bitVal << *nextOutBit;
63-
*nextOutBit += nBits;
64-
if (*nextOutBit >= 64)
65-
{
66-
esmOutputOutBits(outValsT, thisOutIx, outBits);
67-
// init outBits with remainder of bits from current output
68-
*nextOutBit -= 64;
69-
*outBits = bitVal >> (nBits - *nextOutBit);
70-
}
71-
} // end thisOutIx2
72-
7329
int32_t encodeExtendedStringMode(const unsigned char *inVals, unsigned char *outVals, const uint32_t nValuesMax, uint32_t *nValuesOut)
7430
{
7531
// Encode repeated strings and values in input until the 65th unique value,
@@ -492,11 +448,15 @@ int32_t decodeExtendedStringMode(const unsigned char *inVals, unsigned char *out
492448
dsmGetBits(inVals, extended_string_length_bits, &thisInVal, &thisVal, &bitPos, &theBits);
493449
uint32_t stringLen = (uint32_t)theBits + 2;
494450
assert(stringLen <= (extended_string_length_bits==3 ? 9:17));
495-
uint32_t nPosBits = encodingBits512[nextOutVal];
496-
if (nPosBits < 9)
497-
dsmGetBits(inVals, nPosBits, &thisInVal, &thisVal, &bitPos, &theBits); // 8 or fewer bits
451+
if (nextOutVal < 256)
452+
{
453+
const uint32_t nPosBits = encodingBits512[nextOutVal];
454+
dsmGetBits(inVals, nPosBits, &thisInVal, &thisVal, &bitPos, &theBits); // 8 bits
455+
}
498456
else
499-
dsmGetBits2(inVals, nPosBits, &thisInVal, &thisVal, &bitPos, &theBits); // 9 bits
457+
{
458+
dsmGetBits2(inVals, 9, &thisInVal, &thisVal, &bitPos, &theBits); // 9 bits
459+
}
500460
uint32_t stringPos=(uint32_t)theBits;
501461
assert((uint32_t)stringPos+stringLen <= nextOutVal);
502462
assert(nextOutVal+stringLen <= nOriginalValues);

0 commit comments

Comments
 (0)