sdbf
3.3
|
00001 // Header file for sdbf object 00002 // 00003 #ifndef _SDBF_CLASS_H 00004 #define _SDBF_CLASS_H 00005 00006 00007 #include "sdbf_defines.h" 00008 #include "sdbf_conf.h" 00009 #include "bloom_filter.h" 00010 00011 #include <stdint.h> 00012 #include <stdio.h> 00013 #include <iostream> 00014 #include <sstream> 00015 #include <string> 00016 00017 using namespace std; 00018 00019 /** 00020 sdbf: a Similarity Digest Bloom Filter class. 00021 */ 00022 /// sdbf class 00023 class sdbf { 00024 00025 friend std::ostream& operator<<(std::ostream& os, const sdbf& s); ///< output operator 00026 friend std::ostream& operator<<(std::ostream& os, const sdbf *s); ///< output operator 00027 00028 /** \example sdbf_test.cc 00029 * A very short example program using sdbf. 00030 */ 00031 00032 public: 00033 /// to read formatted sdbfs from open file pointer 00034 sdbf(FILE *in); 00035 /// to create new from a single file 00036 sdbf(const char *filename, uint32_t dd_block_size); 00037 /// to create by reading from an open stream 00038 sdbf(const char *name, std::istream *ifs, uint32_t dd_block_size, uint64_t msize, index_info *info) ; 00039 /// to create from a c-string 00040 sdbf(const char *name, char *str, uint32_t dd_block_size, uint64_t length, index_info *info); 00041 /// destructor 00042 ~sdbf(); 00043 00044 /// object name 00045 const char *name(); 00046 /// object size 00047 uint64_t size(); 00048 /// source object size 00049 uint64_t input_size(); 00050 00051 /// matching algorithm, take other object and run match 00052 int32_t compare(sdbf *other, uint32_t sample); 00053 00054 /// return a string representation of this sdbf 00055 string to_string() const ; 00056 00057 /// return results of index search 00058 string get_index_results() const; 00059 00060 /// return a copy of an individual bloom filter from this sdbf 00061 uint8_t *clone_filter(uint32_t position); 00062 uint32_t filter_count(); 00063 00064 public: 00065 /// global configuration object 00066 static class sdbf_conf *config; 00067 static int32_t get_elem_count(sdbf *mine, uint64_t index) ; 00068 00069 private: 00070 00071 int compute_hamming(); 00072 void sdbf_create(const char *filename); 00073 // static int32_t get_elem_count(sdbf *mine, uint64_t index) ; 00074 00075 // from sdbf_core.c: Core SDBF generation/comparison functions 00076 static void gen_chunk_ranks( uint8_t *file_buffer, const uint64_t chunk_size, uint16_t *chunk_ranks, uint16_t carryover); 00077 static void gen_chunk_scores( const uint16_t *chunk_ranks, const uint64_t chunk_size, uint16_t *chunk_scores, int32_t *score_histo); 00078 void gen_chunk_hash( uint8_t *file_buffer, const uint64_t chunk_pos, const uint16_t *chunk_scores, const uint64_t chunk_size); 00079 static void gen_block_hash( uint8_t *file_buffer, uint64_t file_size, const uint64_t block_num, const uint16_t *chunk_scores, const uint64_t block_size, class sdbf *hashto,uint32_t rem, uint32_t threshold, int32_t allowed); 00080 void gen_chunk_sdbf( uint8_t *file_buffer, uint64_t file_size, uint64_t chunk_size); 00081 void gen_block_sdbf_mt( uint8_t *file_buffer, uint64_t file_size, uint64_t block_size, uint32_t thread_cnt); 00082 00083 static void *thread_gen_block_sdbf( void *task_param); 00084 static int sdbf_score( sdbf *sd_1, sdbf *sd_2, uint32_t sample); 00085 static double sdbf_max_score( sdbf_task_t *task); 00086 00087 void print_indexes(uint32_t threshold, vector<uint32_t> *matches, uint64_t pos); 00088 void reset_indexes(vector<uint32_t> *matches); 00089 bool check_indexes(uint32_t* sha1, vector<uint32_t> *matches); 00090 bool is_block_null(uint8_t *buffer, uint32_t size); 00091 00092 public: 00093 uint8_t *buffer; // Beginning of the BF cluster 00094 uint16_t *hamming; // Hamming weight for each BF 00095 // uint16_t *elem_counts; // Individual elements counts for each BF (used in dd mode) 00096 uint32_t max_elem; // Max number of elements per filter (n) 00097 private: 00098 index_info *info; 00099 string index_results; 00100 00101 // from the C structure 00102 char *hashname; // name (usually, source file) 00103 uint32_t bf_count; // Number of BFs 00104 uint32_t bf_size; // BF size in bytes (==m/8) 00105 uint32_t hash_count; // Number of hash functions used (k) 00106 uint32_t mask; // Bit mask used (must agree with m) 00107 uint32_t last_count; // Actual number of elements in last filter (n_last); 00108 // ZERO means look at elem_counts value 00109 //uint8_t *buffer; // Beginning of the BF cluster 00110 //uint16_t *hamming; // Hamming weight for each BF 00111 uint16_t *elem_counts; // Individual elements counts for each BF (used in dd mode) 00112 uint32_t dd_block_size; // Size of the base block in dd mode 00113 uint64_t orig_file_size; // size of the original file 00114 bool filenamealloc; 00115 00116 }; 00117 00118 #endif