| 1 | // -------------------------------------------------------------------------- |
|---|
| 2 | // |
|---|
| 3 | // File |
|---|
| 4 | // Name: BackupStoreFileDiff.cpp |
|---|
| 5 | // Purpose: Functions relating to diffing BackupStoreFiles |
|---|
| 6 | // Created: 12/1/04 |
|---|
| 7 | // |
|---|
| 8 | // -------------------------------------------------------------------------- |
|---|
| 9 | |
|---|
| 10 | #include "Box.h" |
|---|
| 11 | |
|---|
| 12 | #include <string.h> |
|---|
| 13 | |
|---|
| 14 | #include <new> |
|---|
| 15 | #include <map> |
|---|
| 16 | |
|---|
| 17 | #ifdef HAVE_TIME_H |
|---|
| 18 | #include <time.h> |
|---|
| 19 | #elif HAVE_SYS_TIME_H |
|---|
| 20 | #include <sys/time.h> |
|---|
| 21 | #endif |
|---|
| 22 | |
|---|
| 23 | #include "BackupStoreConstants.h" |
|---|
| 24 | #include "BackupStoreException.h" |
|---|
| 25 | #include "BackupStoreFile.h" |
|---|
| 26 | #include "BackupStoreFileCryptVar.h" |
|---|
| 27 | #include "BackupStoreFileEncodeStream.h" |
|---|
| 28 | #include "BackupStoreFileWire.h" |
|---|
| 29 | #include "BackupStoreObjectMagic.h" |
|---|
| 30 | #include "CommonException.h" |
|---|
| 31 | #include "FileStream.h" |
|---|
| 32 | #include "MD5Digest.h" |
|---|
| 33 | #include "RollingChecksum.h" |
|---|
| 34 | #include "Timer.h" |
|---|
| 35 | |
|---|
| 36 | #include "MemLeakFindOn.h" |
|---|
| 37 | |
|---|
| 38 | #include <cstring> |
|---|
| 39 | |
|---|
| 40 | using namespace BackupStoreFileCryptVar; |
|---|
| 41 | using namespace BackupStoreFileCreation; |
|---|
| 42 | |
|---|
| 43 | // By default, don't trace out details of the diff as we go along -- would fill up logs significantly. |
|---|
| 44 | // But it's useful for the test. |
|---|
| 45 | #ifndef BOX_RELEASE_BUILD |
|---|
| 46 | bool BackupStoreFile::TraceDetailsOfDiffProcess = false; |
|---|
| 47 | #endif |
|---|
| 48 | |
|---|
| 49 | static void LoadIndex(IOStream &rBlockIndex, int64_t ThisID, BlocksAvailableEntry **ppIndex, int64_t &rNumBlocksOut, int Timeout, bool &rCanDiffFromThis); |
|---|
| 50 | static void FindMostUsedSizes(BlocksAvailableEntry *pIndex, int64_t NumBlocks, int32_t Sizes[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES]); |
|---|
| 51 | static void SearchForMatchingBlocks(IOStream &rFile, |
|---|
| 52 | std::map<int64_t, int64_t> &rFoundBlocks, BlocksAvailableEntry *pIndex, |
|---|
| 53 | int64_t NumBlocks, int32_t Sizes[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES], |
|---|
| 54 | DiffTimer *pDiffTimer); |
|---|
| 55 | static void SetupHashTable(BlocksAvailableEntry *pIndex, int64_t NumBlocks, int32_t BlockSize, BlocksAvailableEntry **pHashTable); |
|---|
| 56 | static bool SecondStageMatch(BlocksAvailableEntry *pFirstInHashList, RollingChecksum &fastSum, uint8_t *pBeginnings, uint8_t *pEndings, int Offset, int32_t BlockSize, int64_t FileBlockNumber, |
|---|
| 57 | BlocksAvailableEntry *pIndex, std::map<int64_t, int64_t> &rFoundBlocks); |
|---|
| 58 | static void GenerateRecipe(BackupStoreFileEncodeStream::Recipe &rRecipe, BlocksAvailableEntry *pIndex, int64_t NumBlocks, std::map<int64_t, int64_t> &rFoundBlocks, int64_t SizeOfInputFile); |
|---|
| 59 | |
|---|
| 60 | // -------------------------------------------------------------------------- |
|---|
| 61 | // |
|---|
| 62 | // Function |
|---|
| 63 | // Name: BackupStoreFile::MoveStreamPositionToBlockIndex(IOStream &) |
|---|
| 64 | // Purpose: Move the file pointer in this stream to just before the block index. |
|---|
| 65 | // Assumes that the stream is at the beginning, seekable, and |
|---|
| 66 | // reading from the stream is OK. |
|---|
| 67 | // Created: 12/1/04 |
|---|
| 68 | // |
|---|
| 69 | // -------------------------------------------------------------------------- |
|---|
| 70 | void BackupStoreFile::MoveStreamPositionToBlockIndex(IOStream &rStream) |
|---|
| 71 | { |
|---|
| 72 | // Size of file |
|---|
| 73 | int64_t fileSize = rStream.BytesLeftToRead(); |
|---|
| 74 | |
|---|
| 75 | // Get header |
|---|
| 76 | file_StreamFormat hdr; |
|---|
| 77 | |
|---|
| 78 | // Read the header |
|---|
| 79 | if(!rStream.ReadFullBuffer(&hdr, sizeof(hdr), 0 /* not interested in bytes read if this fails */, IOStream::TimeOutInfinite)) |
|---|
| 80 | { |
|---|
| 81 | // Couldn't read header |
|---|
| 82 | THROW_EXCEPTION(BackupStoreException, CouldntReadEntireStructureFromStream) |
|---|
| 83 | } |
|---|
| 84 | |
|---|
| 85 | // Check magic number |
|---|
| 86 | if(ntohl(hdr.mMagicValue) != OBJECTMAGIC_FILE_MAGIC_VALUE_V1 |
|---|
| 87 | #ifndef BOX_DISABLE_BACKWARDS_COMPATIBILITY_BACKUPSTOREFILE |
|---|
| 88 | && ntohl(hdr.mMagicValue) != OBJECTMAGIC_FILE_MAGIC_VALUE_V0 |
|---|
| 89 | #endif |
|---|
| 90 | ) |
|---|
| 91 | { |
|---|
| 92 | THROW_EXCEPTION(BackupStoreException, BadBackupStoreFile) |
|---|
| 93 | } |
|---|
| 94 | |
|---|
| 95 | // Work out where the index is |
|---|
| 96 | int64_t numBlocks = box_ntoh64(hdr.mNumBlocks); |
|---|
| 97 | int64_t blockHeaderPosFromEnd = ((numBlocks * sizeof(file_BlockIndexEntry)) + sizeof(file_BlockIndexHeader)); |
|---|
| 98 | |
|---|
| 99 | // Sanity check |
|---|
| 100 | if(blockHeaderPosFromEnd > static_cast<int64_t>(fileSize - sizeof(file_StreamFormat))) |
|---|
| 101 | { |
|---|
| 102 | THROW_EXCEPTION(BackupStoreException, BadBackupStoreFile) |
|---|
| 103 | } |
|---|
| 104 | |
|---|
| 105 | // Seek to that position |
|---|
| 106 | rStream.Seek(0 - blockHeaderPosFromEnd, IOStream::SeekType_End); |
|---|
| 107 | |
|---|
| 108 | // Done. Stream now in right position (as long as the file is formatted correctly) |
|---|
| 109 | } |
|---|
| 110 | |
|---|
| 111 | |
|---|
| 112 | // -------------------------------------------------------------------------- |
|---|
| 113 | // |
|---|
| 114 | // Function |
|---|
| 115 | // Name: BackupStoreFile::EncodeFileDiff(const char *, int64_t, const BackupStoreFilename &, int64_t, IOStream &, int64_t *) |
|---|
| 116 | // Purpose: Similar to EncodeFile, but takes the object ID of the file it's |
|---|
| 117 | // diffing from, and the index of the blocks in a stream. It'll then |
|---|
| 118 | // calculate which blocks can be reused from that old file. |
|---|
| 119 | // The timeout is the timeout value for reading the diff block index. |
|---|
| 120 | // If pIsCompletelyDifferent != 0, it will be set to true if the |
|---|
| 121 | // the two files are completely different (do not share any block), false otherwise. |
|---|
| 122 | // |
|---|
| 123 | // Created: 12/1/04 |
|---|
| 124 | // |
|---|
| 125 | // -------------------------------------------------------------------------- |
|---|
| 126 | std::auto_ptr<IOStream> BackupStoreFile::EncodeFileDiff |
|---|
| 127 | ( |
|---|
| 128 | const char *Filename, int64_t ContainerID, |
|---|
| 129 | const BackupStoreFilename &rStoreFilename, int64_t DiffFromObjectID, |
|---|
| 130 | IOStream &rDiffFromBlockIndex, int Timeout, DiffTimer *pDiffTimer, |
|---|
| 131 | int64_t *pModificationTime, bool *pIsCompletelyDifferent) |
|---|
| 132 | { |
|---|
| 133 | // Is it a symlink? |
|---|
| 134 | { |
|---|
| 135 | EMU_STRUCT_STAT st; |
|---|
| 136 | if(EMU_LSTAT(Filename, &st) != 0) |
|---|
| 137 | { |
|---|
| 138 | THROW_EXCEPTION(CommonException, OSFileError) |
|---|
| 139 | } |
|---|
| 140 | if((st.st_mode & S_IFLNK) == S_IFLNK) |
|---|
| 141 | { |
|---|
| 142 | // Don't do diffs for symlinks |
|---|
| 143 | if(pIsCompletelyDifferent != 0) |
|---|
| 144 | { |
|---|
| 145 | *pIsCompletelyDifferent = true; |
|---|
| 146 | } |
|---|
| 147 | return EncodeFile(Filename, ContainerID, rStoreFilename, pModificationTime); |
|---|
| 148 | } |
|---|
| 149 | } |
|---|
| 150 | |
|---|
| 151 | // Load in the blocks |
|---|
| 152 | BlocksAvailableEntry *pindex = 0; |
|---|
| 153 | int64_t blocksInIndex = 0; |
|---|
| 154 | bool canDiffFromThis = false; |
|---|
| 155 | LoadIndex(rDiffFromBlockIndex, DiffFromObjectID, &pindex, blocksInIndex, Timeout, canDiffFromThis); |
|---|
| 156 | // BOX_TRACE("Diff: Blocks in index: " << blocksInIndex); |
|---|
| 157 | |
|---|
| 158 | if(!canDiffFromThis) |
|---|
| 159 | { |
|---|
| 160 | // Don't do diffing... |
|---|
| 161 | if(pIsCompletelyDifferent != 0) |
|---|
| 162 | { |
|---|
| 163 | *pIsCompletelyDifferent = true; |
|---|
| 164 | } |
|---|
| 165 | return EncodeFile(Filename, ContainerID, rStoreFilename, pModificationTime); |
|---|
| 166 | } |
|---|
| 167 | |
|---|
| 168 | // Pointer to recipe we're going to create |
|---|
| 169 | BackupStoreFileEncodeStream::Recipe *precipe = 0; |
|---|
| 170 | |
|---|
| 171 | try |
|---|
| 172 | { |
|---|
| 173 | // Find which sizes should be scanned |
|---|
| 174 | int32_t sizesToScan[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES]; |
|---|
| 175 | FindMostUsedSizes(pindex, blocksInIndex, sizesToScan); |
|---|
| 176 | |
|---|
| 177 | // Flag for reporting to the user |
|---|
| 178 | bool completelyDifferent; |
|---|
| 179 | |
|---|
| 180 | // BLOCK |
|---|
| 181 | { |
|---|
| 182 | // Search the file to find matching blocks |
|---|
| 183 | std::map<int64_t, int64_t> foundBlocks; // map of offset in file to index in block index |
|---|
| 184 | int64_t sizeOfInputFile = 0; |
|---|
| 185 | // BLOCK |
|---|
| 186 | { |
|---|
| 187 | FileStream file(Filename); |
|---|
| 188 | // Get size of file |
|---|
| 189 | sizeOfInputFile = file.BytesLeftToRead(); |
|---|
| 190 | // Find all those lovely matching blocks |
|---|
| 191 | SearchForMatchingBlocks(file, foundBlocks, pindex, |
|---|
| 192 | blocksInIndex, sizesToScan, pDiffTimer); |
|---|
| 193 | |
|---|
| 194 | // Is it completely different? |
|---|
| 195 | completelyDifferent = (foundBlocks.size() == 0); |
|---|
| 196 | } |
|---|
| 197 | |
|---|
| 198 | // Create a recipe -- if the two files are completely different, don't put the from file ID in the recipe. |
|---|
| 199 | precipe = new BackupStoreFileEncodeStream::Recipe(pindex, blocksInIndex, completelyDifferent?(0):(DiffFromObjectID)); |
|---|
| 200 | BlocksAvailableEntry *pindexKeptRef = pindex; // we need this later, but must set pindex == 0 now, because of exceptions |
|---|
| 201 | pindex = 0; // Recipe now has ownership |
|---|
| 202 | |
|---|
| 203 | // Fill it in |
|---|
| 204 | GenerateRecipe(*precipe, pindexKeptRef, blocksInIndex, foundBlocks, sizeOfInputFile); |
|---|
| 205 | } |
|---|
| 206 | // foundBlocks no longer required |
|---|
| 207 | |
|---|
| 208 | // Create the stream |
|---|
| 209 | std::auto_ptr<IOStream> stream(new BackupStoreFileEncodeStream); |
|---|
| 210 | |
|---|
| 211 | // Do the initial setup |
|---|
| 212 | ((BackupStoreFileEncodeStream*)stream.get())->Setup(Filename, precipe, ContainerID, rStoreFilename, pModificationTime); |
|---|
| 213 | precipe = 0; // Stream has taken ownership of this |
|---|
| 214 | |
|---|
| 215 | // Tell user about completely different status? |
|---|
| 216 | if(pIsCompletelyDifferent != 0) |
|---|
| 217 | { |
|---|
| 218 | *pIsCompletelyDifferent = completelyDifferent; |
|---|
| 219 | } |
|---|
| 220 | |
|---|
| 221 | // Return the stream for the caller |
|---|
| 222 | return stream; |
|---|
| 223 | } |
|---|
| 224 | catch(...) |
|---|
| 225 | { |
|---|
| 226 | // cleanup |
|---|
| 227 | if(pindex != 0) |
|---|
| 228 | { |
|---|
| 229 | ::free(pindex); |
|---|
| 230 | pindex = 0; |
|---|
| 231 | } |
|---|
| 232 | if(precipe != 0) |
|---|
| 233 | { |
|---|
| 234 | delete precipe; |
|---|
| 235 | precipe = 0; |
|---|
| 236 | } |
|---|
| 237 | throw; |
|---|
| 238 | } |
|---|
| 239 | } |
|---|
| 240 | |
|---|
| 241 | // -------------------------------------------------------------------------- |
|---|
| 242 | // |
|---|
| 243 | // Function |
|---|
| 244 | // Name: static LoadIndex(IOStream &, int64_t, BlocksAvailableEntry **, int64_t, bool &) |
|---|
| 245 | // Purpose: Read in an index, and decrypt, and store in the in memory block format. |
|---|
| 246 | // rCanDiffFromThis is set to false if the version of the from file is too old. |
|---|
| 247 | // Created: 12/1/04 |
|---|
| 248 | // |
|---|
| 249 | // -------------------------------------------------------------------------- |
|---|
| 250 | static void LoadIndex(IOStream &rBlockIndex, int64_t ThisID, BlocksAvailableEntry **ppIndex, int64_t &rNumBlocksOut, int Timeout, bool &rCanDiffFromThis) |
|---|
| 251 | { |
|---|
| 252 | // Reset |
|---|
| 253 | rNumBlocksOut = 0; |
|---|
| 254 | rCanDiffFromThis = false; |
|---|
| 255 | |
|---|
| 256 | // Read header |
|---|
| 257 | file_BlockIndexHeader hdr; |
|---|
| 258 | if(!rBlockIndex.ReadFullBuffer(&hdr, sizeof(hdr), 0 /* not interested in bytes read if this fails */, Timeout)) |
|---|
| 259 | { |
|---|
| 260 | // Couldn't read header |
|---|
| 261 | THROW_EXCEPTION(BackupStoreException, CouldntReadEntireStructureFromStream) |
|---|
| 262 | } |
|---|
| 263 | |
|---|
| 264 | #ifndef BOX_DISABLE_BACKWARDS_COMPATIBILITY_BACKUPSTOREFILE |
|---|
| 265 | // Check against backwards comptaibility stuff |
|---|
| 266 | if(hdr.mMagicValue == (int32_t)htonl(OBJECTMAGIC_FILE_BLOCKS_MAGIC_VALUE_V0)) |
|---|
| 267 | { |
|---|
| 268 | // Won't diff against old version |
|---|
| 269 | |
|---|
| 270 | // Absorb rest of stream |
|---|
| 271 | char buffer[2048]; |
|---|
| 272 | while(rBlockIndex.StreamDataLeft()) |
|---|
| 273 | { |
|---|
| 274 | rBlockIndex.Read(buffer, sizeof(buffer), 1000 /* 1 sec timeout */); |
|---|
| 275 | } |
|---|
| 276 | |
|---|
| 277 | // Tell caller |
|---|
| 278 | rCanDiffFromThis = false; |
|---|
| 279 | return; |
|---|
| 280 | } |
|---|
| 281 | #endif |
|---|
| 282 | |
|---|
| 283 | // Check magic |
|---|
| 284 | if(hdr.mMagicValue != (int32_t)htonl(OBJECTMAGIC_FILE_BLOCKS_MAGIC_VALUE_V1)) |
|---|
| 285 | { |
|---|
| 286 | THROW_EXCEPTION(BackupStoreException, BadBackupStoreFile) |
|---|
| 287 | } |
|---|
| 288 | |
|---|
| 289 | // Check that we're not trying to diff against a file which references blocks from another file |
|---|
| 290 | if(((int64_t)box_ntoh64(hdr.mOtherFileID)) != 0) |
|---|
| 291 | { |
|---|
| 292 | THROW_EXCEPTION(BackupStoreException, CannotDiffAnIncompleteStoreFile) |
|---|
| 293 | } |
|---|
| 294 | |
|---|
| 295 | // Mark as an acceptable diff. |
|---|
| 296 | rCanDiffFromThis = true; |
|---|
| 297 | |
|---|
| 298 | // Get basic information |
|---|
| 299 | int64_t numBlocks = box_ntoh64(hdr.mNumBlocks); |
|---|
| 300 | uint64_t entryIVBase = box_ntoh64(hdr.mEntryIVBase); |
|---|
| 301 | |
|---|
| 302 | //TODO: Verify that these sizes look reasonable |
|---|
| 303 | |
|---|
| 304 | // Allocate space for the index |
|---|
| 305 | BlocksAvailableEntry *pindex = (BlocksAvailableEntry*)::malloc(sizeof(BlocksAvailableEntry) * numBlocks); |
|---|
| 306 | if(pindex == 0) |
|---|
| 307 | { |
|---|
| 308 | throw std::bad_alloc(); |
|---|
| 309 | } |
|---|
| 310 | |
|---|
| 311 | try |
|---|
| 312 | { |
|---|
| 313 | for(int64_t b = 0; b < numBlocks; ++b) |
|---|
| 314 | { |
|---|
| 315 | // Read an entry from the stream |
|---|
| 316 | file_BlockIndexEntry entry; |
|---|
| 317 | if(!rBlockIndex.ReadFullBuffer(&entry, sizeof(entry), 0 /* not interested in bytes read if this fails */, Timeout)) |
|---|
| 318 | { |
|---|
| 319 | // Couldn't read entry |
|---|
| 320 | THROW_EXCEPTION(BackupStoreException, CouldntReadEntireStructureFromStream) |
|---|
| 321 | } |
|---|
| 322 | |
|---|
| 323 | // Calculate IV for this entry |
|---|
| 324 | uint64_t iv = entryIVBase; |
|---|
| 325 | iv += b; |
|---|
| 326 | // Network byte order |
|---|
| 327 | iv = box_hton64(iv); |
|---|
| 328 | sBlowfishDecryptBlockEntry.SetIV(&iv); |
|---|
| 329 | |
|---|
| 330 | // Decrypt the encrypted section |
|---|
| 331 | file_BlockIndexEntryEnc entryEnc; |
|---|
| 332 | int sectionSize = sBlowfishDecryptBlockEntry.TransformBlock(&entryEnc, sizeof(entryEnc), |
|---|
| 333 | entry.mEnEnc, sizeof(entry.mEnEnc)); |
|---|
| 334 | if(sectionSize != sizeof(entryEnc)) |
|---|
| 335 | { |
|---|
| 336 | THROW_EXCEPTION(BackupStoreException, BlockEntryEncodingDidntGiveExpectedLength) |
|---|
| 337 | } |
|---|
| 338 | |
|---|
| 339 | // Check that we're not trying to diff against a file which references blocks from another file |
|---|
| 340 | if(((int64_t)box_ntoh64(entry.mEncodedSize)) <= 0) |
|---|
| 341 | { |
|---|
| 342 | THROW_EXCEPTION(BackupStoreException, CannotDiffAnIncompleteStoreFile) |
|---|
| 343 | } |
|---|
| 344 | |
|---|
| 345 | // Store all the required information |
|---|
| 346 | pindex[b].mpNextInHashList = 0; // hash list not set up yet |
|---|
| 347 | pindex[b].mSize = ntohl(entryEnc.mSize); |
|---|
| 348 | pindex[b].mWeakChecksum = ntohl(entryEnc.mWeakChecksum); |
|---|
| 349 | ::memcpy(pindex[b].mStrongChecksum, entryEnc.mStrongChecksum, sizeof(pindex[b].mStrongChecksum)); |
|---|
| 350 | } |
|---|
| 351 | |
|---|
| 352 | // Store index pointer for called |
|---|
| 353 | ASSERT(ppIndex != 0); |
|---|
| 354 | *ppIndex = pindex; |
|---|
| 355 | |
|---|
| 356 | // Store number of blocks for caller |
|---|
| 357 | rNumBlocksOut = numBlocks; |
|---|
| 358 | |
|---|
| 359 | } |
|---|
| 360 | catch(...) |
|---|
| 361 | { |
|---|
| 362 | // clean up and send the exception along its way |
|---|
| 363 | ::free(pindex); |
|---|
| 364 | throw; |
|---|
| 365 | } |
|---|
| 366 | } |
|---|
| 367 | |
|---|
| 368 | |
|---|
| 369 | // -------------------------------------------------------------------------- |
|---|
| 370 | // |
|---|
| 371 | // Function |
|---|
| 372 | // Name: static FindMostUsedSizes(BlocksAvailableEntry *, int64_t, int32_t[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES]) |
|---|
| 373 | // Purpose: Finds the most commonly used block sizes in the index |
|---|
| 374 | // Created: 12/1/04 |
|---|
| 375 | // |
|---|
| 376 | // -------------------------------------------------------------------------- |
|---|
| 377 | static void FindMostUsedSizes(BlocksAvailableEntry *pIndex, int64_t NumBlocks, int32_t Sizes[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES]) |
|---|
| 378 | { |
|---|
| 379 | // Array for lengths |
|---|
| 380 | int64_t sizeCounts[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES]; |
|---|
| 381 | |
|---|
| 382 | // Set arrays to lots of zeros (= unused entries) |
|---|
| 383 | for(int l = 0; l < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++l) |
|---|
| 384 | { |
|---|
| 385 | Sizes[l] = 0; |
|---|
| 386 | sizeCounts[l] = 0; |
|---|
| 387 | } |
|---|
| 388 | |
|---|
| 389 | // Array for collecting sizes |
|---|
| 390 | std::map<int32_t, int64_t> foundSizes; |
|---|
| 391 | |
|---|
| 392 | // Run through blocks and make a count of the entries |
|---|
| 393 | for(int64_t b = 0; b < NumBlocks; ++b) |
|---|
| 394 | { |
|---|
| 395 | // Only if the block size is bigger than the minimum size we'll scan for |
|---|
| 396 | if(pIndex[b].mSize > BACKUP_FILE_DIFF_MIN_BLOCK_SIZE) |
|---|
| 397 | { |
|---|
| 398 | // Find entry? |
|---|
| 399 | std::map<int32_t, int64_t>::const_iterator f(foundSizes.find(pIndex[b].mSize)); |
|---|
| 400 | if(f != foundSizes.end()) |
|---|
| 401 | { |
|---|
| 402 | // Increment existing entry |
|---|
| 403 | foundSizes[pIndex[b].mSize] = foundSizes[pIndex[b].mSize] + 1; |
|---|
| 404 | } |
|---|
| 405 | else |
|---|
| 406 | { |
|---|
| 407 | // New entry |
|---|
| 408 | foundSizes[pIndex[b].mSize] = 1; |
|---|
| 409 | } |
|---|
| 410 | } |
|---|
| 411 | } |
|---|
| 412 | |
|---|
| 413 | // Make the block sizes |
|---|
| 414 | for(std::map<int32_t, int64_t>::const_iterator i(foundSizes.begin()); i != foundSizes.end(); ++i) |
|---|
| 415 | { |
|---|
| 416 | // Find the position of the size in the array |
|---|
| 417 | for(int t = 0; t < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++t) |
|---|
| 418 | { |
|---|
| 419 | // Instead of sorting on the raw count of blocks, |
|---|
| 420 | // take the file area covered by this block size. |
|---|
| 421 | if(i->second * i->first > sizeCounts[t] * Sizes[t]) |
|---|
| 422 | { |
|---|
| 423 | // Then this size belong before this entry -- shuffle them up |
|---|
| 424 | for(int s = (BACKUP_FILE_DIFF_MAX_BLOCK_SIZES - 1); s >= t; --s) |
|---|
| 425 | { |
|---|
| 426 | Sizes[s] = Sizes[s-1]; |
|---|
| 427 | sizeCounts[s] = sizeCounts[s-1]; |
|---|
| 428 | } |
|---|
| 429 | |
|---|
| 430 | // Insert this size |
|---|
| 431 | Sizes[t] = i->first; |
|---|
| 432 | sizeCounts[t] = i->second; |
|---|
| 433 | |
|---|
| 434 | // Shouldn't do any more searching |
|---|
| 435 | break; |
|---|
| 436 | } |
|---|
| 437 | } |
|---|
| 438 | } |
|---|
| 439 | |
|---|
| 440 | // trace the size table in debug builds |
|---|
| 441 | #ifndef BOX_RELEASE_BUILD |
|---|
| 442 | if(BackupStoreFile::TraceDetailsOfDiffProcess) |
|---|
| 443 | { |
|---|
| 444 | for(int t = 0; t < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++t) |
|---|
| 445 | { |
|---|
| 446 | BOX_TRACE("Diff block size " << t << ": " << |
|---|
| 447 | Sizes[t] << " (count = " << |
|---|
| 448 | sizeCounts[t] << ")"); |
|---|
| 449 | } |
|---|
| 450 | } |
|---|
| 451 | #endif |
|---|
| 452 | } |
|---|
| 453 | |
|---|
| 454 | |
|---|
| 455 | |
|---|
| 456 | // -------------------------------------------------------------------------- |
|---|
| 457 | // |
|---|
| 458 | // Function |
|---|
| 459 | // Name: static SearchForMatchingBlocks(IOStream &, std::map<int64_t, int64_t> &, BlocksAvailableEntry *, int64_t, int32_t[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES]) |
|---|
| 460 | // Purpose: Find the matching blocks within the file. |
|---|
| 461 | // Created: 12/1/04 |
|---|
| 462 | // |
|---|
| 463 | // -------------------------------------------------------------------------- |
|---|
| 464 | static void SearchForMatchingBlocks(IOStream &rFile, std::map<int64_t, int64_t> &rFoundBlocks, |
|---|
| 465 | BlocksAvailableEntry *pIndex, int64_t NumBlocks, |
|---|
| 466 | int32_t Sizes[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES], DiffTimer *pDiffTimer) |
|---|
| 467 | { |
|---|
| 468 | Timer maximumDiffingTime(0, "MaximumDiffingTime"); |
|---|
| 469 | |
|---|
| 470 | if(pDiffTimer && pDiffTimer->IsManaged()) |
|---|
| 471 | { |
|---|
| 472 | maximumDiffingTime = Timer(pDiffTimer->GetMaximumDiffingTime() * 1000, |
|---|
| 473 | "MaximumDiffingTime"); |
|---|
| 474 | } |
|---|
| 475 | |
|---|
| 476 | std::map<int64_t, int32_t> goodnessOfFit; |
|---|
| 477 | |
|---|
| 478 | // Allocate the hash lookup table |
|---|
| 479 | BlocksAvailableEntry **phashTable = (BlocksAvailableEntry **)::malloc(sizeof(BlocksAvailableEntry *) * (64*1024)); |
|---|
| 480 | |
|---|
| 481 | // Choose a size for the buffer, just a little bit more than the maximum block size |
|---|
| 482 | int32_t bufSize = Sizes[0]; |
|---|
| 483 | for(int z = 1; z < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++z) |
|---|
| 484 | { |
|---|
| 485 | if(Sizes[z] > bufSize) bufSize = Sizes[z]; |
|---|
| 486 | } |
|---|
| 487 | bufSize += 4; |
|---|
| 488 | ASSERT(bufSize > Sizes[0]); |
|---|
| 489 | ASSERT(bufSize > 0); |
|---|
| 490 | if(bufSize > (BACKUP_FILE_MAX_BLOCK_SIZE + 1024)) |
|---|
| 491 | { |
|---|
| 492 | THROW_EXCEPTION(BackupStoreException, BadBackupStoreFile) |
|---|
| 493 | } |
|---|
| 494 | |
|---|
| 495 | // TODO: Because we read in the file a scanned block size at a time, |
|---|
| 496 | // it is likely to be inefficient. Probably will be much better to |
|---|
| 497 | // calculate checksums for all block sizes in a single pass. |
|---|
| 498 | |
|---|
| 499 | // Allocate the buffers. |
|---|
| 500 | uint8_t *pbuffer0 = (uint8_t *)::malloc(bufSize); |
|---|
| 501 | uint8_t *pbuffer1 = (uint8_t *)::malloc(bufSize); |
|---|
| 502 | try |
|---|
| 503 | { |
|---|
| 504 | // Check buffer allocation |
|---|
| 505 | if(pbuffer0 == 0 || pbuffer1 == 0 || phashTable == 0) |
|---|
| 506 | { |
|---|
| 507 | // If a buffer got allocated, it will be cleaned up in the catch block |
|---|
| 508 | throw std::bad_alloc(); |
|---|
| 509 | } |
|---|
| 510 | |
|---|
| 511 | // Flag to abort the run, if too many blocks are found -- avoid using |
|---|
| 512 | // huge amounts of processor time when files contain many similar blocks. |
|---|
| 513 | bool abortSearch = false; |
|---|
| 514 | |
|---|
| 515 | // Search for each block size in turn |
|---|
| 516 | // NOTE: Do the smallest size first, so that the scheme for adding |
|---|
| 517 | // entries in the found list works as expected and replaces smallers block |
|---|
| 518 | // with larger blocks when it finds matches at the same offset in the file. |
|---|
| 519 | for(int s = BACKUP_FILE_DIFF_MAX_BLOCK_SIZES - 1; s >= 0; --s) |
|---|
| 520 | { |
|---|
| 521 | ASSERT(Sizes[s] <= bufSize); |
|---|
| 522 | BOX_TRACE("Diff pass " << s << ", for block size " << |
|---|
| 523 | Sizes[s]); |
|---|
| 524 | |
|---|
| 525 | // Check we haven't finished |
|---|
| 526 | if(Sizes[s] == 0) |
|---|
| 527 | { |
|---|
| 528 | // empty entry, try next size |
|---|
| 529 | continue; |
|---|
| 530 | } |
|---|
| 531 | |
|---|
| 532 | // Set up the hash table entries |
|---|
| 533 | SetupHashTable(pIndex, NumBlocks, Sizes[s], phashTable); |
|---|
| 534 | |
|---|
| 535 | // Shift file position to beginning |
|---|
| 536 | rFile.Seek(0, IOStream::SeekType_Absolute); |
|---|
| 537 | |
|---|
| 538 | // Read first block |
|---|
| 539 | if(rFile.Read(pbuffer0, Sizes[s]) != Sizes[s]) |
|---|
| 540 | { |
|---|
| 541 | // Size of file too short to match -- do next size |
|---|
| 542 | continue; |
|---|
| 543 | } |
|---|
| 544 | |
|---|
| 545 | // Setup block pointers |
|---|
| 546 | uint8_t *beginnings = pbuffer0; |
|---|
| 547 | uint8_t *endings = pbuffer1; |
|---|
| 548 | int offset = 0; |
|---|
| 549 | |
|---|
| 550 | // Calculate the first checksum, ready for rolling |
|---|
| 551 | RollingChecksum rolling(beginnings, Sizes[s]); |
|---|
| 552 | |
|---|
| 553 | // Then roll, until the file is exhausted |
|---|
| 554 | int64_t fileBlockNumber = 0; |
|---|
| 555 | int64_t fileOffset = 0; |
|---|
| 556 | int rollOverInitialBytes = 0; |
|---|
| 557 | while(true) |
|---|
| 558 | { |
|---|
| 559 | if(maximumDiffingTime.HasExpired()) |
|---|
| 560 | { |
|---|
| 561 | ASSERT(pDiffTimer != NULL); |
|---|
| 562 | BOX_INFO("MaximumDiffingTime reached - " |
|---|
| 563 | "suspending file diff"); |
|---|
| 564 | abortSearch = true; |
|---|
| 565 | break; |
|---|
| 566 | } |
|---|
| 567 | |
|---|
| 568 | if(pDiffTimer) |
|---|
| 569 | { |
|---|
| 570 | pDiffTimer->DoKeepAlive(); |
|---|
| 571 | } |
|---|
| 572 | |
|---|
| 573 | // Load in another block of data, and record how big it is |
|---|
| 574 | int bytesInEndings = rFile.Read(endings, Sizes[s]); |
|---|
| 575 | int tmp; |
|---|
| 576 | |
|---|
| 577 | // Skip any bytes from a previous matched block |
|---|
| 578 | if(rollOverInitialBytes > 0 && offset < bytesInEndings) |
|---|
| 579 | { |
|---|
| 580 | int spaceLeft = bytesInEndings - offset; |
|---|
| 581 | int thisRoll = (rollOverInitialBytes > spaceLeft) ? spaceLeft : rollOverInitialBytes; |
|---|
| 582 | |
|---|
| 583 | rolling.RollForwardSeveral(beginnings+offset, endings+offset, Sizes[s], thisRoll); |
|---|
| 584 | |
|---|
| 585 | offset += thisRoll; |
|---|
| 586 | fileOffset += thisRoll; |
|---|
| 587 | rollOverInitialBytes -= thisRoll; |
|---|
| 588 | |
|---|
| 589 | if(rollOverInitialBytes) |
|---|
| 590 | { |
|---|
| 591 | goto refresh; |
|---|
| 592 | } |
|---|
| 593 | } |
|---|
| 594 | |
|---|
| 595 | if(goodnessOfFit.count(fileOffset)) |
|---|
| 596 | { |
|---|
| 597 | tmp = goodnessOfFit[fileOffset]; |
|---|
| 598 | } |
|---|
| 599 | else |
|---|
| 600 | { |
|---|
| 601 | tmp = 0; |
|---|
| 602 | } |
|---|
| 603 | |
|---|
| 604 | if(tmp >= Sizes[s]) |
|---|
| 605 | { |
|---|
| 606 | // Skip over bigger ready-matched blocks completely |
|---|
| 607 | rollOverInitialBytes = tmp; |
|---|
| 608 | int spaceLeft = bytesInEndings - offset; |
|---|
| 609 | int thisRoll = (rollOverInitialBytes > spaceLeft) ? spaceLeft : rollOverInitialBytes; |
|---|
| 610 | |
|---|
| 611 | rolling.RollForwardSeveral(beginnings+offset, endings+offset, Sizes[s], thisRoll); |
|---|
| 612 | |
|---|
| 613 | offset += thisRoll; |
|---|
| 614 | fileOffset += thisRoll; |
|---|
| 615 | rollOverInitialBytes -= thisRoll; |
|---|
| 616 | |
|---|
| 617 | if(rollOverInitialBytes) |
|---|
| 618 | { |
|---|
| 619 | goto refresh; |
|---|
| 620 | } |
|---|
| 621 | } |
|---|
| 622 | |
|---|
| 623 | while(offset < bytesInEndings) |
|---|
| 624 | { |
|---|
| 625 | // Is current checksum in hash list? |
|---|
| 626 | uint16_t hash = rolling.GetComponentForHashing(); |
|---|
| 627 | if(phashTable[hash] != 0 && (goodnessOfFit.count(fileOffset) == 0 || goodnessOfFit[fileOffset] < Sizes[s])) |
|---|
| 628 | { |
|---|
| 629 | if(SecondStageMatch(phashTable[hash], rolling, beginnings, endings, offset, Sizes[s], fileBlockNumber, pIndex, rFoundBlocks)) |
|---|
| 630 | { |
|---|
| 631 | BOX_TRACE("Found block match for " << hash << " of " << Sizes[s] << " bytes at offset " << fileOffset); |
|---|
| 632 | goodnessOfFit[fileOffset] = Sizes[s]; |
|---|
| 633 | |
|---|
| 634 | // Block matched, roll the checksum forward to the next block without doing |
|---|
| 635 | // any more comparisons, because these are pointless (as any more matches will be ignored when |
|---|
| 636 | // the recipe is generated) and just take up valuable processor time. Edge cases are |
|---|
| 637 | // especially nasty, using huge amounts of time and memory. |
|---|
| 638 | int skip = Sizes[s]; |
|---|
| 639 | if(offset < bytesInEndings && skip > 0) |
|---|
| 640 | { |
|---|
| 641 | int spaceLeft = bytesInEndings - offset; |
|---|
| 642 | int thisRoll = (skip > spaceLeft) ? spaceLeft : skip; |
|---|
| 643 | |
|---|
| 644 | rolling.RollForwardSeveral(beginnings+offset, endings+offset, Sizes[s], thisRoll); |
|---|
| 645 | |
|---|
| 646 | offset += thisRoll; |
|---|
| 647 | fileOffset += thisRoll; |
|---|
| 648 | skip -= thisRoll; |
|---|
| 649 | } |
|---|
| 650 | // Not all the bytes necessary will have been skipped, so get them |
|---|
| 651 | // skipped after the next block is loaded. |
|---|
| 652 | rollOverInitialBytes = skip; |
|---|
| 653 | |
|---|
| 654 | // End this loop, so the final byte isn't used again |
|---|
| 655 | break; |
|---|
| 656 | } |
|---|
| 657 | else |
|---|
| 658 | { |
|---|
| 659 | BOX_TRACE("False alarm match for " << hash << " of " << Sizes[s] << " bytes at offset " << fileOffset); |
|---|
| 660 | } |
|---|
| 661 | |
|---|
| 662 | int64_t NumBlocksFound = static_cast<int64_t>( |
|---|
| 663 | rFoundBlocks.size()); |
|---|
| 664 | int64_t MaxBlocksFound = NumBlocks * |
|---|
| 665 | BACKUP_FILE_DIFF_MAX_BLOCK_FIND_MULTIPLE; |
|---|
| 666 | |
|---|
| 667 | if(NumBlocksFound > MaxBlocksFound) |
|---|
| 668 | { |
|---|
| 669 | abortSearch = true; |
|---|
| 670 | break; |
|---|
| 671 | } |
|---|
| 672 | } |
|---|
| 673 | |
|---|
| 674 | // Roll checksum forward |
|---|
| 675 | rolling.RollForward(beginnings[offset], endings[offset], Sizes[s]); |
|---|
| 676 | |
|---|
| 677 | // Increment offsets |
|---|
| 678 | ++offset; |
|---|
| 679 | ++fileOffset; |
|---|
| 680 | } |
|---|
| 681 | |
|---|
| 682 | if(abortSearch) break; |
|---|
| 683 | |
|---|
| 684 | refresh: |
|---|
| 685 | // Finished? |
|---|
| 686 | if(bytesInEndings != Sizes[s]) |
|---|
| 687 | { |
|---|
| 688 | // No more data in file -- check the final block |
|---|
| 689 | // (Do a copy and paste of 5 lines of code instead of introducing a comparison for |
|---|
| 690 | // each byte of the file) |
|---|
| 691 | uint16_t hash = rolling.GetComponentForHashing(); |
|---|
| 692 | if(phashTable[hash] != 0 && (goodnessOfFit.count(fileOffset) == 0 || goodnessOfFit[fileOffset] < Sizes[s])) |
|---|
| 693 | { |
|---|
| 694 | if(SecondStageMatch(phashTable[hash], rolling, beginnings, endings, offset, Sizes[s], fileBlockNumber, pIndex, rFoundBlocks)) |
|---|
| 695 | { |
|---|
| 696 | goodnessOfFit[fileOffset] = Sizes[s]; |
|---|
| 697 | } |
|---|
| 698 | } |
|---|
| 699 | |
|---|
| 700 | // finish |
|---|
| 701 | break; |
|---|
| 702 | } |
|---|
| 703 | |
|---|
| 704 | // Switch buffers, reset offset |
|---|
| 705 | beginnings = endings; |
|---|
| 706 | endings = (beginnings == pbuffer0)?(pbuffer1):(pbuffer0); // ie the other buffer |
|---|
| 707 | offset = 0; |
|---|
| 708 | |
|---|
| 709 | // And count the blocks which have been done |
|---|
| 710 | ++fileBlockNumber; |
|---|
| 711 | } |
|---|
| 712 | |
|---|
| 713 | if(abortSearch) break; |
|---|
| 714 | } |
|---|
| 715 | |
|---|
| 716 | // Free buffers and hash table |
|---|
| 717 | ::free(pbuffer1); |
|---|
| 718 | pbuffer1 = 0; |
|---|
| 719 | ::free(pbuffer0); |
|---|
| 720 | pbuffer0 = 0; |
|---|
| 721 | ::free(phashTable); |
|---|
| 722 | phashTable = 0; |
|---|
| 723 | } |
|---|
| 724 | catch(...) |
|---|
| 725 | { |
|---|
| 726 | // Cleanup and throw |
|---|
| 727 | if(pbuffer1 != 0) ::free(pbuffer1); |
|---|
| 728 | if(pbuffer0 != 0) ::free(pbuffer0); |
|---|
| 729 | if(phashTable != 0) ::free(phashTable); |
|---|
| 730 | throw; |
|---|
| 731 | } |
|---|
| 732 | |
|---|
| 733 | #ifndef BOX_RELEASE_BUILD |
|---|
| 734 | if(BackupStoreFile::TraceDetailsOfDiffProcess) |
|---|
| 735 | { |
|---|
| 736 | // Trace out the found blocks in debug mode |
|---|
| 737 | BOX_TRACE("Diff: list of found blocks"); |
|---|
| 738 | BOX_TRACE("======== ======== ======== ========"); |
|---|
| 739 | BOX_TRACE(" Offset BlkIdx Size Movement"); |
|---|
| 740 | for(std::map<int64_t, int64_t>::const_iterator i(rFoundBlocks.begin()); i != rFoundBlocks.end(); ++i) |
|---|
| 741 | { |
|---|
| 742 | int64_t orgLoc = 0; |
|---|
| 743 | for(int64_t b = 0; b < i->second; ++b) |
|---|
| 744 | { |
|---|
| 745 | orgLoc += pIndex[b].mSize; |
|---|
| 746 | } |
|---|
| 747 | BOX_TRACE(std::setw(8) << i->first << " " << |
|---|
| 748 | std::setw(8) << i->second << " " << |
|---|
| 749 | std::setw(8) << pIndex[i->second].mSize << |
|---|
| 750 | " " << |
|---|
| 751 | std::setw(8) << (i->first - orgLoc)); |
|---|
| 752 | } |
|---|
| 753 | BOX_TRACE("======== ======== ======== ========"); |
|---|
| 754 | } |
|---|
| 755 | #endif |
|---|
| 756 | } |
|---|
| 757 | |
|---|
| 758 | |
|---|
| 759 | // -------------------------------------------------------------------------- |
|---|
| 760 | // |
|---|
| 761 | // Function |
|---|
| 762 | // Name: static SetupHashTable(BlocksAvailableEntry *, int64_t, in32_t, BlocksAvailableEntry **) |
|---|
| 763 | // Purpose: Set up the hash table ready for a scan |
|---|
| 764 | // Created: 14/1/04 |
|---|
| 765 | // |
|---|
| 766 | // -------------------------------------------------------------------------- |
|---|
| 767 | static void SetupHashTable(BlocksAvailableEntry *pIndex, int64_t NumBlocks, int32_t BlockSize, BlocksAvailableEntry **pHashTable) |
|---|
| 768 | { |
|---|
| 769 | // Set all entries in the hash table to zero |
|---|
| 770 | ::memset(pHashTable, 0, (sizeof(BlocksAvailableEntry *) * (64*1024))); |
|---|
| 771 | |
|---|
| 772 | // Scan through the blocks, building the hash table |
|---|
| 773 | for(int64_t b = 0; b < NumBlocks; ++b) |
|---|
| 774 | { |
|---|
| 775 | // Only look at the required block size |
|---|
| 776 | if(pIndex[b].mSize == BlockSize) |
|---|
| 777 | { |
|---|
| 778 | // Get the value under which to hash this entry |
|---|
| 779 | uint16_t hash = RollingChecksum::ExtractHashingComponent(pIndex[b].mWeakChecksum); |
|---|
| 780 | |
|---|
| 781 | // Already present in table? |
|---|
| 782 | if(pHashTable[hash] != 0) |
|---|
| 783 | { |
|---|
| 784 | //BOX_TRACE("Another hash entry for " << hash << " found"); |
|---|
| 785 | // Yes -- need to set the pointer in this entry to the current entry to build the linked list |
|---|
| 786 | pIndex[b].mpNextInHashList = pHashTable[hash]; |
|---|
| 787 | } |
|---|
| 788 | |
|---|
| 789 | // Put a pointer to this entry in the hash table |
|---|
| 790 | pHashTable[hash] = pIndex + b; |
|---|
| 791 | } |
|---|
| 792 | } |
|---|
| 793 | } |
|---|
| 794 | |
|---|
| 795 | |
|---|
| 796 | // -------------------------------------------------------------------------- |
|---|
| 797 | // |
|---|
| 798 | // Function |
|---|
| 799 | // Name: static bool SecondStageMatch(xxx) |
|---|
| 800 | // Purpose: When a match in the hash table is found, scan for second stage match using strong checksum. |
|---|
| 801 | // Created: 14/1/04 |
|---|
| 802 | // |
|---|
| 803 | // -------------------------------------------------------------------------- |
|---|
| 804 | static bool SecondStageMatch(BlocksAvailableEntry *pFirstInHashList, RollingChecksum &fastSum, uint8_t *pBeginnings, uint8_t *pEndings, |
|---|
| 805 | int Offset, int32_t BlockSize, int64_t FileBlockNumber, BlocksAvailableEntry *pIndex, std::map<int64_t, int64_t> &rFoundBlocks) |
|---|
| 806 | { |
|---|
| 807 | // Check parameters |
|---|
| 808 | ASSERT(pBeginnings != 0); |
|---|
| 809 | ASSERT(pEndings != 0); |
|---|
| 810 | ASSERT(Offset >= 0); |
|---|
| 811 | ASSERT(BlockSize > 0); |
|---|
| 812 | ASSERT(pFirstInHashList != 0); |
|---|
| 813 | ASSERT(pIndex != 0); |
|---|
| 814 | |
|---|
| 815 | #ifndef BOX_RELEASE_BUILD |
|---|
| 816 | uint16_t DEBUG_Hash = fastSum.GetComponentForHashing(); |
|---|
| 817 | #endif |
|---|
| 818 | uint32_t Checksum = fastSum.GetChecksum(); |
|---|
| 819 | |
|---|
| 820 | // Before we go to the expense of the MD5, make sure it's a darn good match on the checksum we already know. |
|---|
| 821 | BlocksAvailableEntry *scan = pFirstInHashList; |
|---|
| 822 | bool found=false; |
|---|
| 823 | while(scan != 0) |
|---|
| 824 | { |
|---|
| 825 | if(scan->mWeakChecksum == Checksum) |
|---|
| 826 | { |
|---|
| 827 | found = true; |
|---|
| 828 | break; |
|---|
| 829 | } |
|---|
| 830 | scan = scan->mpNextInHashList; |
|---|
| 831 | } |
|---|
| 832 | if(!found) |
|---|
| 833 | { |
|---|
| 834 | return false; |
|---|
| 835 | } |
|---|
| 836 | |
|---|
| 837 | // Calculate the strong MD5 digest for this block |
|---|
| 838 | MD5Digest strong; |
|---|
| 839 | // Add the data from the beginnings |
|---|
| 840 | strong.Add(pBeginnings + Offset, BlockSize - Offset); |
|---|
| 841 | // Add any data from the endings |
|---|
| 842 | if(Offset > 0) |
|---|
| 843 | { |
|---|
| 844 | strong.Add(pEndings, Offset); |
|---|
| 845 | } |
|---|
| 846 | strong.Finish(); |
|---|
| 847 | |
|---|
| 848 | // Then go through the entries in the hash list, comparing with the strong digest calculated |
|---|
| 849 | scan = pFirstInHashList; |
|---|
| 850 | //BOX_TRACE("second stage match"); |
|---|
| 851 | while(scan != 0) |
|---|
| 852 | { |
|---|
| 853 | //BOX_TRACE("scan size " << scan->mSize << |
|---|
| 854 | // ", block size " << BlockSize << |
|---|
| 855 | // ", hash " << Hash); |
|---|
| 856 | ASSERT(scan->mSize == BlockSize); |
|---|
| 857 | ASSERT(RollingChecksum::ExtractHashingComponent(scan->mWeakChecksum) == DEBUG_Hash); |
|---|
| 858 | |
|---|
| 859 | // Compare? |
|---|
| 860 | if(strong.DigestMatches(scan->mStrongChecksum)) |
|---|
| 861 | { |
|---|
| 862 | //BOX_TRACE("Match!\n"); |
|---|
| 863 | // Found! Add to list of found blocks... |
|---|
| 864 | int64_t fileOffset = (FileBlockNumber * BlockSize) + Offset; |
|---|
| 865 | int64_t blockIndex = (scan - pIndex); // pointer arthmitic is frowned upon. But most efficient way of doing it here -- alternative is to use more memory |
|---|
| 866 | |
|---|
| 867 | // We do NOT search for smallest blocks first, as this code originally assumed. |
|---|
| 868 | // To prevent this from potentially overwriting a better match, the caller must determine |
|---|
| 869 | // the relative "goodness" of any existing match and this one, and avoid the call if it |
|---|
| 870 | // could be detrimental. |
|---|
| 871 | rFoundBlocks[fileOffset] = blockIndex; |
|---|
| 872 | |
|---|
| 873 | // No point in searching further, report success |
|---|
| 874 | return true; |
|---|
| 875 | } |
|---|
| 876 | |
|---|
| 877 | // Next |
|---|
| 878 | scan = scan->mpNextInHashList; |
|---|
| 879 | } |
|---|
| 880 | |
|---|
| 881 | // Not matched |
|---|
| 882 | return false; |
|---|
| 883 | } |
|---|
| 884 | |
|---|
| 885 | |
|---|
| 886 | // -------------------------------------------------------------------------- |
|---|
| 887 | // |
|---|
| 888 | // Function |
|---|
| 889 | // Name: static GenerateRecipe(BackupStoreFileEncodeStream::Recipe &, BlocksAvailableEntry *, int64_t, std::map<int64_t, int64_t> &) |
|---|
| 890 | // Purpose: Fills in the recipe from the found block list |
|---|
| 891 | // Created: 15/1/04 |
|---|
| 892 | // |
|---|
| 893 | // -------------------------------------------------------------------------- |
|---|
| 894 | static void GenerateRecipe(BackupStoreFileEncodeStream::Recipe &rRecipe, BlocksAvailableEntry *pIndex, |
|---|
| 895 | int64_t NumBlocks, std::map<int64_t, int64_t> &rFoundBlocks, int64_t SizeOfInputFile) |
|---|
| 896 | { |
|---|
| 897 | // NOTE: This function could be a lot more sophisiticated. For example, if |
|---|
| 898 | // a small block overlaps a big block like this |
|---|
| 899 | // **** |
|---|
| 900 | // ******************************* |
|---|
| 901 | // then the small block will be used, not the big one. But it'd be better to |
|---|
| 902 | // just ignore the small block and keep the big one. However, some stats should |
|---|
| 903 | // be gathered about real world files before writing complex code which might |
|---|
| 904 | // go wrong. |
|---|
| 905 | |
|---|
| 906 | // Initialise a blank instruction |
|---|
| 907 | BackupStoreFileEncodeStream::RecipeInstruction instruction; |
|---|
| 908 | #define RESET_INSTRUCTION \ |
|---|
| 909 | instruction.mSpaceBefore = 0; \ |
|---|
| 910 | instruction.mBlocks = 0; \ |
|---|
| 911 | instruction.mpStartBlock = 0; |
|---|
| 912 | RESET_INSTRUCTION |
|---|
| 913 | |
|---|
| 914 | // First, a special case for when there are no found blocks |
|---|
| 915 | if(rFoundBlocks.size() == 0) |
|---|
| 916 | { |
|---|
| 917 | // No blocks, just a load of space |
|---|
| 918 | instruction.mSpaceBefore = SizeOfInputFile; |
|---|
| 919 | rRecipe.push_back(instruction); |
|---|
| 920 | |
|---|
| 921 | #ifndef BOX_RELEASE_BUILD |
|---|
| 922 | if(BackupStoreFile::TraceDetailsOfDiffProcess) |
|---|
| 923 | { |
|---|
| 924 | BOX_TRACE("Diff: Default recipe generated, " << |
|---|
| 925 | SizeOfInputFile << " bytes of file"); |
|---|
| 926 | } |
|---|
| 927 | #endif |
|---|
| 928 | |
|---|
| 929 | // Don't do anything |
|---|
| 930 | return; |
|---|
| 931 | } |
|---|
| 932 | |
|---|
| 933 | // Current location |
|---|
| 934 | int64_t loc = 0; |
|---|
| 935 | |
|---|
| 936 | // Then iterate through the list, generating the recipe |
|---|
| 937 | std::map<int64_t, int64_t>::const_iterator i(rFoundBlocks.begin()); |
|---|
| 938 | ASSERT(i != rFoundBlocks.end()); // check logic |
|---|
| 939 | |
|---|
| 940 | // Counting for debug tracing |
|---|
| 941 | #ifndef BOX_RELEASE_BUILD |
|---|
| 942 | int64_t debug_NewBytesFound = 0; |
|---|
| 943 | int64_t debug_OldBlocksUsed = 0; |
|---|
| 944 | #endif |
|---|
| 945 | |
|---|
| 946 | for(; i != rFoundBlocks.end(); ++i) |
|---|
| 947 | { |
|---|
| 948 | // Remember... map is (position in file) -> (index of block in pIndex) |
|---|
| 949 | |
|---|
| 950 | if(i->first < loc) |
|---|
| 951 | { |
|---|
| 952 | // This block overlaps the last one |
|---|
| 953 | continue; |
|---|
| 954 | } |
|---|
| 955 | else if(i->first > loc) |
|---|
| 956 | { |
|---|
| 957 | // There's a gap between the end of the last thing and this block. |
|---|
| 958 | // If there's an instruction waiting, push it onto the list |
|---|
| 959 | if(instruction.mSpaceBefore != 0 || instruction.mpStartBlock != 0) |
|---|
| 960 | { |
|---|
| 961 | rRecipe.push_back(instruction); |
|---|
| 962 | } |
|---|
| 963 | // Start a new instruction, with the gap ready |
|---|
| 964 | RESET_INSTRUCTION |
|---|
| 965 | instruction.mSpaceBefore = i->first - loc; |
|---|
| 966 | // Move location forward to match |
|---|
| 967 | loc += instruction.mSpaceBefore; |
|---|
| 968 | #ifndef BOX_RELEASE_BUILD |
|---|
| 969 | debug_NewBytesFound += instruction.mSpaceBefore; |
|---|
| 970 | #endif |
|---|
| 971 | } |
|---|
| 972 | |
|---|
| 973 | // First, does the current instruction need pushing back, because this block is not |
|---|
| 974 | // sequential to the last one? |
|---|
| 975 | if(instruction.mpStartBlock != 0 && (pIndex + i->second) != (instruction.mpStartBlock + instruction.mBlocks)) |
|---|
| 976 | { |
|---|
| 977 | rRecipe.push_back(instruction); |
|---|
| 978 | RESET_INSTRUCTION |
|---|
| 979 | } |
|---|
| 980 | |
|---|
| 981 | // Add in this block |
|---|
| 982 | if(instruction.mpStartBlock == 0) |
|---|
| 983 | { |
|---|
| 984 | // This block starts a new instruction |
|---|
| 985 | instruction.mpStartBlock = pIndex + i->second; |
|---|
| 986 | instruction.mBlocks = 1; |
|---|
| 987 | } |
|---|
| 988 | else |
|---|
| 989 | { |
|---|
| 990 | // It continues the previous section of blocks |
|---|
| 991 | instruction.mBlocks += 1; |
|---|
| 992 | } |
|---|
| 993 | |
|---|
| 994 | #ifndef BOX_RELEASE_BUILD |
|---|
| 995 | debug_OldBlocksUsed++; |
|---|
| 996 | #endif |
|---|
| 997 | |
|---|
| 998 | // Move location forward |
|---|
| 999 | loc += pIndex[i->second].mSize; |
|---|
| 1000 | } |
|---|
| 1001 | |
|---|
| 1002 | // Push the last instruction generated |
|---|
| 1003 | rRecipe.push_back(instruction); |
|---|
| 1004 | |
|---|
| 1005 | // Is there any space left at the end which needs sending? |
|---|
| 1006 | if(loc != SizeOfInputFile) |
|---|
| 1007 | { |
|---|
| 1008 | RESET_INSTRUCTION |
|---|
| 1009 | instruction.mSpaceBefore = SizeOfInputFile - loc; |
|---|
| 1010 | #ifndef BOX_RELEASE_BUILD |
|---|
| 1011 | debug_NewBytesFound += instruction.mSpaceBefore; |
|---|
| 1012 | #endif |
|---|
| 1013 | rRecipe.push_back(instruction); |
|---|
| 1014 | } |
|---|
| 1015 | |
|---|
| 1016 | // dump out the recipe |
|---|
| 1017 | #ifndef BOX_RELEASE_BUILD |
|---|
| 1018 | BOX_TRACE("Diff: " << |
|---|
| 1019 | debug_NewBytesFound << " new bytes found, " << |
|---|
| 1020 | debug_OldBlocksUsed << " old blocks used"); |
|---|
| 1021 | if(BackupStoreFile::TraceDetailsOfDiffProcess) |
|---|
| 1022 | { |
|---|
| 1023 | BOX_TRACE("Diff: Recipe generated (size " << rRecipe.size()); |
|---|
| 1024 | BOX_TRACE("======== ========= ========"); |
|---|
| 1025 | BOX_TRACE("Space b4 FirstBlk NumBlks"); |
|---|
| 1026 | { |
|---|
| 1027 | for(unsigned int e = 0; e < rRecipe.size(); ++e) |
|---|
| 1028 | { |
|---|
| 1029 | char b[64]; |
|---|
| 1030 | #ifdef WIN32 |
|---|
| 1031 | sprintf(b, "%8I64d", (int64_t)(rRecipe[e].mpStartBlock - pIndex)); |
|---|
| 1032 | #else |
|---|
| 1033 | sprintf(b, "%8lld", (int64_t)(rRecipe[e].mpStartBlock - pIndex)); |
|---|
| 1034 | #endif |
|---|
| 1035 | BOX_TRACE(std::setw(8) << |
|---|
| 1036 | rRecipe[e].mSpaceBefore << |
|---|
| 1037 | " " << |
|---|
| 1038 | ((rRecipe[e].mpStartBlock == 0)?" -":b) << |
|---|
| 1039 | " " << std::setw(8) << |
|---|
| 1040 | rRecipe[e].mBlocks); |
|---|
| 1041 | } |
|---|
| 1042 | } |
|---|
| 1043 | BOX_TRACE("======== ========= ========"); |
|---|
| 1044 | } |
|---|
| 1045 | #endif |
|---|
| 1046 | } |
|---|