intset 实现了一个数字元素的集合。
使用数组和元素的有序存放实现存取,查找过程使用二分查找法,所有插入删除的的效率为O(log2N)。 与其他数据结构类似,作者使用变编码方式实现对内存的高效利用。 初始化的intset中的数字定义为int16_t,即每个元素占用2个字节,而随着数据的插入,逐渐调整编码方式到int32_t或int64_t
上代码
intset.h
1 #ifndef __INTSET_H 2 #define __INTSET_H 3 #include4 5 typedef struct intset { 6 uint32_t encoding; 7 uint32_t length; 8 int8_t contents[]; 9 } intset;10 11 intset *intsetNew(void);12 intset *intsetAdd(intset *is, int64_t value, uint8_t *success);13 intset *intsetRemove(intset *is, int64_t value, int *success);14 uint8_t intsetFind(intset *is, int64_t value);15 int64_t intsetRandom(intset *is);16 uint8_t intsetGet(intset *is, uint32_t pos, int64_t *value);17 uint32_t intsetLen(intset *is);18 19 #endif // __INTSET_H
intset.c
1 #include2 #include 3 #include 4 #include "intset.h" 5 #include "zmalloc.h" 6 7 /* Note that these encodings are ordered, so: 8 * INTSET_ENC_INT16 < INTSET_ENC_INT32 < INTSET_ENC_INT64. */ 9 #define INTSET_ENC_INT16 (sizeof(int16_t)) 10 #define INTSET_ENC_INT32 (sizeof(int32_t)) 11 #define INTSET_ENC_INT64 (sizeof(int64_t)) 12 13 /* Return the required encoding for the provided value. */ 14 //根据v的值判断使用哪儿种int以节省内存 15 static uint8_t _intsetValueEncoding(int64_t v) { 16 if (v < INT32_MIN || v > INT32_MAX) 17 return INTSET_ENC_INT64; 18 else if (v < INT16_MIN || v > INT16_MAX) 19 return INTSET_ENC_INT32; 20 return INTSET_ENC_INT16; 21 } 22 23 //得到enc编码方式下的第pos个位置的值 24 /* Return the value at pos, given an encoding. */ 25 static int64_t _intsetGetEncoded(intset *is, int pos, uint8_t enc) { 26 if (enc == INTSET_ENC_INT64) 27 return ((int64_t*)is->contents)[pos]; 28 else if (enc == INTSET_ENC_INT32) 29 return ((int32_t*)is->contents)[pos]; 30 return ((int16_t*)is->contents)[pos]; 31 } 32 33 //得到is中第pos个位置的value 34 /* Return the value at pos, using the configured encoding. */ 35 static int64_t _intsetGet(intset *is, int pos) { 36 return _intsetGetEncoded(is,pos,is->encoding); 37 } 38 39 //在第n个pos个位置存放value 40 /* Set the value at pos, using the configured encoding. */ 41 static void _intsetSet(intset *is, int pos, int64_t value) { 42 if (is->encoding == INTSET_ENC_INT64) 43 ((int64_t*)is->contents)[pos] = value; 44 else if (is->encoding == INTSET_ENC_INT32) 45 ((int32_t*)is->contents)[pos] = value; 46 else 47 ((int16_t*)is->contents)[pos] = value; 48 } 49 50 //得到一个新的intset,编码采用INTSET_ENC_INT16 51 /* Create an empty intset. */ 52 intset *intsetNew(void) { 53 intset *is = zmalloc(sizeof(intset)); 54 is->encoding = INTSET_ENC_INT16; 55 is->length = 0; 56 return is; 57 } 58 59 /* Resize the intset */ 60 //重新为intset分配内存 61 static intset *intsetResize(intset *is, uint32_t len) { 62 uint32_t size = len*is->encoding; 63 is = zrealloc(is,sizeof(intset)+size); 64 return is; 65 } 66 67 68 //在排好序的iniset中查找value 69 //如果找到,返回1,*pos为其所在位置 70 //如果找不到,返回0,*pos为其能插入的位置 71 //二分查找法 72 /* Search for the position of "value". Return 1 when the value was found and 73 * sets "pos" to the position of the value within the intset. Return 0 when 74 * the value is not present in the intset and sets "pos" to the position 75 * where "value" can be inserted. */ 76 static uint8_t intsetSearch(intset *is, int64_t value, uint32_t *pos) { 77 int min = 0, max = is->length-1, mid = -1; 78 int64_t cur = -1; 79 80 /* The value can never be found when the set is empty */ 81 if (is->length == 0) { 82 if (pos) *pos = 0; 83 return 0; 84 } else { 85 /* Check for the case where we know we cannot find the value, 86 * but do know the insert position. */ 87 //如果插入值大于最后一个元素,或者小于第一个元素,则可以认定无法找到该元素 88 if (value > _intsetGet(is,is->length-1)) { 89 if (pos) *pos = is->length; 90 return 0; 91 } else if (value < _intsetGet(is,0)) { 92 if (pos) *pos = 0; 93 return 0; 94 } 95 } 96 97 while(max >= min) { 98 mid = (min+max)/2; 99 cur = _intsetGet(is,mid);100 if (value > cur) {101 min = mid+1;102 } else if (value < cur) {103 max = mid-1;104 } else {105 break;106 }107 }108 109 if (value == cur) {110 if (pos) *pos = mid;111 return 1;112 } else {113 if (pos) *pos = min;114 return 0;115 }116 }117 118 //升级intset的编码,并且出入一个新数字,因为新数字的绝对值一定大于119 //当前intset中所有的数字的绝对值,如果是负数,则最小,放在最前边,否则放在最后边120 /* Upgrades the intset to a larger encoding and inserts the given integer. */121 static intset *intsetUpgradeAndAdd(intset *is, int64_t value) {122 uint8_t curenc = is->encoding;123 uint8_t newenc = _intsetValueEncoding(value);124 int length = is->length;125 int prepend = value < 0 ? 1 : 0;126 127 /* First set new encoding and resize */128 is->encoding = newenc;129 is = intsetResize(is,is->length+1);130 131 /* Upgrade back-to-front so we don't overwrite values.132 * Note that the "prepend" variable is used to make sure we have an empty133 * space at either the beginning or the end of the intset. */134 while(length--)135 _intsetSet(is,length+prepend,_intsetGetEncoded(is,length,curenc));136 137 /* Set the value at the beginning or the end. */138 if (prepend)139 _intsetSet(is,0,value);140 else141 _intsetSet(is,is->length,value);142 is->length++;143 return is;144 }145 146 //把from开始直到最后intset最后的内容move到to开始的地方,在这之前,应该要resize一下147 static void intsetMoveTail(intset *is, uint32_t from, uint32_t to) {148 void *src, *dst;149 uint32_t bytes = is->length-from;150 if (is->encoding == INTSET_ENC_INT64) {151 src = (int64_t*)is->contents+from;152 dst = (int64_t*)is->contents+to;153 bytes *= sizeof(int64_t);154 } else if (is->encoding == INTSET_ENC_INT32) {155 src = (int32_t*)is->contents+from;156 dst = (int32_t*)is->contents+to;157 bytes *= sizeof(int32_t);158 } else {159 src = (int16_t*)is->contents+from;160 dst = (int16_t*)is->contents+to;161 bytes *= sizeof(int16_t);162 }163 memmove(dst,src,bytes);164 }165 166 //在intset中插入一个元素,如果返回0,表示已经存在,否则返回1167 /* Insert an integer in the intset */168 intset *intsetAdd(intset *is, int64_t value, uint8_t *success) {169 uint8_t valenc = _intsetValueEncoding(value);170 uint32_t pos;171 if (success) *success = 1;172 173 /* Upgrade encoding if necessary. If we need to upgrade, we know that174 * this value should be either appended (if > 0) or prepended (if < 0),175 * because it lies outside the range of existing values. */176 if (valenc > is->encoding) {177 /* This always succeeds, so we don't need to curry *success. */178 return intsetUpgradeAndAdd(is,value);179 } else {180 /* Abort if the value is already present in the set.181 * This call will populate "pos" with the right position to insert182 * the value when it cannot be found. */183 if (intsetSearch(is,value,&pos)) {184 if (success) *success = 0;185 return is;186 }187 188 is = intsetResize(is,is->length+1);189 if (pos < is->length) intsetMoveTail(is,pos,pos+1);190 }191 192 _intsetSet(is,pos,value);193 is->length++;194 return is;195 }196 197 //在intset中删除一个元素value198 /* Delete integer from intset */199 intset *intsetRemove(intset *is, int64_t value, int *success) {200 uint8_t valenc = _intsetValueEncoding(value);201 uint32_t pos;202 if (success) *success = 0;203 204 if (valenc <= is->encoding && intsetSearch(is,value,&pos)) {205 /* We know we can delete */206 if (success) *success = 1;207 208 /* Overwrite value with tail and update length */209 if (pos < (is->length-1)) intsetMoveTail(is,pos+1,pos);210 is = intsetResize(is,is->length-1);211 is->length--;212 }213 return is;214 }215 216 //判断value是否在is中217 /* Determine whether a value belongs to this set */218 uint8_t intsetFind(intset *is, int64_t value) {219 uint8_t valenc = _intsetValueEncoding(value);220 return valenc <= is->encoding && intsetSearch(is,value,NULL);221 }222 223 //随机取一个intset中的元素224 /* Return random member */225 int64_t intsetRandom(intset *is) {226 return _intsetGet(is,rand()%is->length);227 }228 229 //把pos位置上的元素取出,如果超出位置,返回0,找到了则返回1,*value为其值230 /* Sets the value to the value at the given position. When this position is231 * out of range the function returns 0, when in range it returns 1. */232 uint8_t intsetGet(intset *is, uint32_t pos, int64_t *value) {233 if (pos < is->length) {234 *value = _intsetGet(is,pos);235 return 1;236 }237 return 0;238 }239 240 //得到intset的元素数量241 /* Return intset length */242 uint32_t intsetLen(intset *is) {243 return is->length;244 }245 246 #ifdef INTSET_TEST_MAIN247 #include 248 249 void intsetRepr(intset *is) {250 int i;251 for (i = 0; i < is->length; i++) {252 printf("%lld\n", (uint64_t)_intsetGet(is,i));253 }254 printf("\n");255 }256 257 void error(char *err) {258 printf("%s\n", err);259 exit(1);260 }261 262 void ok(void) {263 printf("OK\n");264 }265 266 long long usec(void) {267 struct timeval tv;268 gettimeofday(&tv,NULL);269 return (((long long)tv.tv_sec)*1000000)+tv.tv_usec;270 }271 272 #define assert(_e) ((_e)?(void)0:(_assert(#_e,__FILE__,__LINE__),exit(1)))273 void _assert(char *estr, char *file, int line) {274 printf("\n\n=== ASSERTION FAILED ===\n");275 printf("==> %s:%d '%s' is not true\n",file,line,estr);276 }277 278 intset *createSet(int bits, int size) {279 uint64_t mask = (1< 32) {285 value = (rand()*rand()) & mask;286 } else {287 value = rand() & mask;288 }289 is = intsetAdd(is,value,NULL);290 }291 return is;292 }293 294 void checkConsistency(intset *is) {295 int i;296 297 for (i = 0; i < (is->length-1); i++) {298 if (is->encoding == INTSET_ENC_INT16) {299 int16_t *i16 = (int16_t*)is->contents;300 assert(i16[i] < i16[i+1]);301 } else if (is->encoding == INTSET_ENC_INT32) {302 int32_t *i32 = (int32_t*)is->contents;303 assert(i32[i] < i32[i+1]);304 } else {305 int64_t *i64 = (int64_t*)is->contents;306 assert(i64[i] < i64[i+1]);307 }308 }309 }310 311 int main(int argc, char **argv) {312 uint8_t success;313 int i;314 intset *is;315 sranddev();316 317 printf("Value encodings: "); {318 assert(_intsetValueEncoding(-32768) == INTSET_ENC_INT16);319 assert(_intsetValueEncoding(+32767) == INTSET_ENC_INT16);320 assert(_intsetValueEncoding(-32769) == INTSET_ENC_INT32);321 assert(_intsetValueEncoding(+32768) == INTSET_ENC_INT32);322 assert(_intsetValueEncoding(-2147483648) == INTSET_ENC_INT32);323 assert(_intsetValueEncoding(+2147483647) == INTSET_ENC_INT32);324 assert(_intsetValueEncoding(-2147483649) == INTSET_ENC_INT64);325 assert(_intsetValueEncoding(+2147483648) == INTSET_ENC_INT64);326 assert(_intsetValueEncoding(-9223372036854775808ull) == INTSET_ENC_INT64);327 assert(_intsetValueEncoding(+9223372036854775807ull) == INTSET_ENC_INT64);328 ok();329 }330 331 printf("Basic adding: "); {332 is = intsetNew();333 is = intsetAdd(is,5,&success); assert(success);334 is = intsetAdd(is,6,&success); assert(success);335 is = intsetAdd(is,4,&success); assert(success);336 is = intsetAdd(is,4,&success); assert(!success);337 ok();338 }339 340 printf("Large number of random adds: "); {341 int inserts = 0;342 is = intsetNew();343 for (i = 0; i < 1024; i++) {344 is = intsetAdd(is,rand()%0x800,&success);345 if (success) inserts++;346 }347 assert(is->length == inserts);348 checkConsistency(is);349 ok();350 }351 352 printf("Upgrade from int16 to int32: "); {353 is = intsetNew();354 is = intsetAdd(is,32,NULL);355 assert(is->encoding == INTSET_ENC_INT16);356 is = intsetAdd(is,65535,NULL);357 assert(is->encoding == INTSET_ENC_INT32);358 assert(intsetFind(is,32));359 assert(intsetFind(is,65535));360 checkConsistency(is);361 362 is = intsetNew();363 is = intsetAdd(is,32,NULL);364 assert(is->encoding == INTSET_ENC_INT16);365 is = intsetAdd(is,-65535,NULL);366 assert(is->encoding == INTSET_ENC_INT32);367 assert(intsetFind(is,32));368 assert(intsetFind(is,-65535));369 checkConsistency(is);370 ok();371 }372 373 printf("Upgrade from int16 to int64: "); {374 is = intsetNew();375 is = intsetAdd(is,32,NULL);376 assert(is->encoding == INTSET_ENC_INT16);377 is = intsetAdd(is,4294967295,NULL);378 assert(is->encoding == INTSET_ENC_INT64);379 assert(intsetFind(is,32));380 assert(intsetFind(is,4294967295));381 checkConsistency(is);382 383 is = intsetNew();384 is = intsetAdd(is,32,NULL);385 assert(is->encoding == INTSET_ENC_INT16);386 is = intsetAdd(is,-4294967295,NULL);387 assert(is->encoding == INTSET_ENC_INT64);388 assert(intsetFind(is,32));389 assert(intsetFind(is,-4294967295));390 checkConsistency(is);391 ok();392 }393 394 printf("Upgrade from int32 to int64: "); {395 is = intsetNew();396 is = intsetAdd(is,65535,NULL);397 assert(is->encoding == INTSET_ENC_INT32);398 is = intsetAdd(is,4294967295,NULL);399 assert(is->encoding == INTSET_ENC_INT64);400 assert(intsetFind(is,65535));401 assert(intsetFind(is,4294967295));402 checkConsistency(is);403 404 is = intsetNew();405 is = intsetAdd(is,65535,NULL);406 assert(is->encoding == INTSET_ENC_INT32);407 is = intsetAdd(is,-4294967295,NULL);408 assert(is->encoding == INTSET_ENC_INT64);409 assert(intsetFind(is,65535));410 assert(intsetFind(is,-4294967295));411 checkConsistency(is);412 ok();413 }414 415 printf("Stress lookups: "); {416 long num = 100000, size = 10000;417 int i, bits = 20;418 long long start;419 is = createSet(bits,size);420 checkConsistency(is);421 422 start = usec();423 for (i = 0; i < num; i++) intsetSearch(is,rand() % ((1<