sdbf  3.3
 All Classes Functions Variables Friends
sdbf/sdbf_class.h
00001 // Header file for sdbf object
00002 //
00003 #ifndef _SDBF_CLASS_H 
00004 #define _SDBF_CLASS_H
00005 
00006 
00007 #include "sdbf_defines.h"
00008 #include "sdbf_conf.h"
00009 #include "bloom_filter.h"
00010 
00011 #include <stdint.h>
00012 #include <stdio.h>
00013 #include <iostream>
00014 #include <sstream>
00015 #include <string>
00016 
00017 using namespace std;
00018 
00019 /**
00020     sdbf:  a Similarity Digest Bloom Filter class. Declares constructors that build a digest from a FILE pointer, a file name, an input stream or a character buffer, plus accessors, compare() and to_string(). Only declarations live in this header; the implementations are elsewhere.
00021 */
00022 /// sdbf class
00023 class sdbf {
00024 
00025     friend std::ostream& operator<<(std::ostream& os, const sdbf& s); ///< output operator for an sdbf passed by const reference
00026     friend std::ostream& operator<<(std::ostream& os, const sdbf *s); ///< output operator for an sdbf passed by pointer
00027 
00028     /** \example sdbf_test.cc
00029     *  A very short example program using sdbf.
00030     */
00031 
00032 public:
00033     /// construct by reading a formatted sdbf from the already-open file pointer \a in
00034     sdbf(FILE *in); 
00035     /// to create a new sdbf from a single file named \a filename; \a dd_block_size presumably selects the dd-mode block size (cf. the dd_block_size member) -- TODO confirm
00036     sdbf(const char *filename, uint32_t dd_block_size); 
00037     /// to create by reading from an open stream \a ifs; \a msize is presumably the number of bytes to read and \a info the index information kept in the info member -- TODO confirm
00038     sdbf(const char *name, std::istream *ifs, uint32_t dd_block_size, uint64_t msize, index_info *info) ; 
00039     /// to create from a c-string \a str; \a length is presumably the buffer length in bytes -- TODO confirm
00040     sdbf(const char *name, char *str, uint32_t dd_block_size, uint64_t length, index_info *info);
00041     /// destructor
00042     ~sdbf(); 
00043 
00044     /// object name (presumably the hashname member -- confirm in the implementation)
00045     const char *name();  
00046     /// object size
00047     uint64_t size();  
00048     /// source object size (presumably orig_file_size -- confirm in the implementation)
00049     uint64_t input_size();  
00050 
00051     /// matching algorithm, take \a other object and run match; returns an int32_t result (the meaning of \a sample and the result range are not visible in this header)
00052     int32_t compare(sdbf *other, uint32_t sample);
00053 
00054     /// return a string representation of this sdbf (declared const)
00055     string to_string() const ; 
00056 
00057     /// return results of index search (presumably the index_results member -- confirm)
00058     string get_index_results() const; 
00059 
00060     /// return a copy of an individual bloom filter from this sdbf, selected by \a position; ownership of the returned buffer is not stated here -- check the implementation before freeing
00061     uint8_t *clone_filter(uint32_t position);
00062     uint32_t filter_count(); ///< NOTE(review): presumably the number of bloom filters (bf_count) -- confirm
00063 
00064 public:
00065     /// global configuration object (static: one pointer shared by all sdbf instances)
00066     static class sdbf_conf *config;  
00067     static int32_t get_elem_count(sdbf *mine, uint64_t index) ; ///< NOTE(review): presumably the element count of filter \a index in \a mine -- confirm
00068 
00069 private:
00070 
00071     int compute_hamming(); // NOTE(review): presumably fills the hamming array -- confirm
00072     void sdbf_create(const char *filename); // NOTE(review): presumably common setup used by the constructors -- confirm
00073    // static int32_t get_elem_count(sdbf *mine, uint64_t index) ;  (disabled here; the active declaration is in the public section above)
00074 
00075     // from sdbf_core.c: Core SDBF generation/comparison functions (declared here, implemented outside this header)
00076     static void gen_chunk_ranks( uint8_t *file_buffer, const uint64_t chunk_size, uint16_t *chunk_ranks, uint16_t carryover);
00077     static void gen_chunk_scores( const uint16_t *chunk_ranks, const uint64_t chunk_size, uint16_t *chunk_scores, int32_t *score_histo);
00078     void gen_chunk_hash( uint8_t *file_buffer, const uint64_t chunk_pos, const uint16_t *chunk_scores, const uint64_t chunk_size);
00079     static void gen_block_hash( uint8_t *file_buffer, uint64_t file_size, const uint64_t block_num, const uint16_t *chunk_scores, const uint64_t block_size, class sdbf *hashto,uint32_t rem, uint32_t threshold, int32_t allowed);
00080     void gen_chunk_sdbf( uint8_t *file_buffer, uint64_t file_size, uint64_t chunk_size);
00081     void gen_block_sdbf_mt( uint8_t *file_buffer, uint64_t file_size, uint64_t block_size, uint32_t thread_cnt);
00082 
00083     static void *thread_gen_block_sdbf( void *task_param);
00084     static int     sdbf_score( sdbf *sd_1, sdbf *sd_2, uint32_t sample);
00085     static double  sdbf_max_score( sdbf_task_t *task);
00086     // helpers below take a vector<uint32_t> of index matches and/or a raw byte buffer; their exact semantics are not visible in this header
00087     void print_indexes(uint32_t threshold, vector<uint32_t> *matches, uint64_t pos);
00088     void reset_indexes(vector<uint32_t> *matches);
00089     bool check_indexes(uint32_t* sha1, vector<uint32_t> *matches);
00090     bool is_block_null(uint8_t *buffer, uint32_t size);
00091     
00092 public:
00093     uint8_t  *buffer;        // Beginning of the BF cluster (public data member)
00094     uint16_t *hamming;       // Hamming weight for each BF (public data member)
00095    // uint16_t *elem_counts;   // Individual elements counts for each BF (used in dd mode) -- disabled here; the active declaration is in the private section below
00096     uint32_t  max_elem;      // Max number of elements per filter (n)
00097 private:
00098     index_info *info; // NOTE(review): presumably the index_info handed to the constructors; ownership not visible here -- confirm
00099     string index_results; // NOTE(review): presumably what get_index_results() returns -- confirm
00100         
00101     // from the C structure 
00102     char *hashname;          // name (usually, source file)
00103     uint32_t  bf_count;      // Number of BFs
00104     uint32_t  bf_size;       // BF size in bytes (==m/8)
00105     uint32_t  hash_count;    // Number of hash functions used (k)
00106     uint32_t  mask;          // Bit mask used (must agree with m)
00107     uint32_t  last_count;    // Actual number of elements in last filter (n_last); 
00108                                                          // ZERO means look at elem_counts value 
00109     //uint8_t  *buffer;        // Beginning of the BF cluster (disabled here; declared public above)
00110     //uint16_t *hamming;       // Hamming weight for each BF (disabled here; declared public above)
00111     uint16_t *elem_counts;   // Individual elements counts for each BF (used in dd mode)
00112     uint32_t  dd_block_size; // Size of the base block in dd mode
00113     uint64_t orig_file_size; // size of the original file
00114     bool     filenamealloc; // NOTE(review): presumably records whether hashname was allocated by this object -- confirm
00115 
00116 };
00117 
00118 #endif