Ticket #45: BackupFileDiff.patch
| File BackupFileDiff.patch, 46.7 KB (added by aharper, 4 years ago) |
|---|
-
lib/backupclient/BackupStoreFile.h
206 206 static void ResetStats(); 207 207 static BackupStoreFileStats msStats; 208 208 209 // For debug210 #ifndef NDEBUG211 static bool TraceDetailsOfDiffProcess;212 #endif213 214 209 // For decoding encoded files 215 210 static void DumpFile(void *clibFileHandle, bool ToTrace, IOStream &rFile); 216 211 }; -
test/backupdiff/testbackupdiff.cpp
376 376 // Want to trace out all the details 377 377 #ifndef NDEBUG 378 378 #ifndef WIN32 379 BackupStoreFile::TraceDetailsOfDiffProcess = true;379 Logging::SetGlobalLevel(Log::TRACE); 380 380 #endif 381 381 #endif 382 382 -
lib/backupclient/BackupStoreFileDiff.cpp
36 36 using namespace BackupStoreFileCryptVar; 37 37 using namespace BackupStoreFileCreation; 38 38 39 // By default, don't trace out details of the diff as we go along -- would fill up logs significantly.40 // But it's useful for the test.41 #ifndef NDEBUG42 bool BackupStoreFile::TraceDetailsOfDiffProcess = false;43 #endif44 45 39 static void LoadIndex(IOStream &rBlockIndex, int64_t ThisID, BlocksAvailableEntry **ppIndex, int64_t &rNumBlocksOut, int Timeout, bool &rCanDiffFromThis); 46 static void FindMostUsedSizes(BlocksAvailableEntry *pIndex, int64_t NumBlocks, int32_t Sizes[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES]); 47 static void SearchForMatchingBlocks(IOStream &rFile, 48 std::map<int64_t, int64_t> &rFoundBlocks, BlocksAvailableEntry *pIndex, 49 int64_t NumBlocks, int32_t Sizes[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES], 50 DiffTimer *pDiffTimer); 40 static void SearchForMatchingBlocks(IOStream &rFile, std::map<int64_t, int64_t> &rFoundBlocks, BlocksAvailableEntry *pIndex, int64_t NumBlocks, DiffTimer *pDiffTimer); 51 41 static void SetupHashTable(BlocksAvailableEntry *pIndex, int64_t NumBlocks, int32_t BlockSize, BlocksAvailableEntry **pHashTable); 52 static bool SecondStageMatch(BlocksAvailableEntry *pFirstInHashList, RollingChecksum &fastSum, uint8_t *pBeginnings, uint8_t *pEndings, int Offset, int32_t BlockSize, int64_t FileBlockNumber, 53 BlocksAvailableEntry *pIndex, std::map<int64_t, int64_t> &rFoundBlocks); 42 static bool SecondStageMatch(BlocksAvailableEntry *pFirstInHashList, RollingChecksum *pFastSum, 43 uint8_t *pLow, int32_t LowSize, uint8_t *pHigh, int32_t HighSize, int32_t BlockSize, int64_t FileOffset, 44 BlocksAvailableEntry *pIndex, std::map<int64_t, int64_t> &rFoundBlocks); 54 45 static void GenerateRecipe(BackupStoreFileEncodeStream::Recipe &rRecipe, BlocksAvailableEntry *pIndex, int64_t NumBlocks, std::map<int64_t, int64_t> &rFoundBlocks, int64_t SizeOfInputFile); 55 46 56 47 // -------------------------------------------------------------------------- … … 166 157 167 158 try 168 159 { 169 // Find which sizes should be scanned170 int32_t sizesToScan[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES];171 FindMostUsedSizes(pindex, blocksInIndex, sizesToScan);172 173 160 // Flag for reporting to the user 174 161 bool completelyDifferent; 175 162 … … 184 171 // Get size of file 185 172 sizeOfInputFile = file.BytesLeftToRead(); 186 173 // Find all those lovely matching blocks 187 SearchForMatchingBlocks(file, foundBlocks, pindex, 188 blocksInIndex, sizesToScan, pDiffTimer); 174 SearchForMatchingBlocks(file, foundBlocks, pindex, blocksInIndex, pDiffTimer); 189 175 190 176 // Is it completely different? 191 177 completelyDifferent = (foundBlocks.size() == 0); … … 361 347 } 362 348 } 363 349 364 365 350 // -------------------------------------------------------------------------- 366 351 // 367 352 // Function 368 // Name: static FindMostUsedSizes(BlocksAvailableEntry *, int64_t, int32_t[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES])369 // Purpose: Find s the most commonly used block sizes in the index353 // Name: static SearchForMatchingBlocks(IOStream &, std::map<int64_t, int64_t> &, BlocksAvailableEntry *, int64_t, DiffTimer *) 354 // Purpose: Find the matching blocks within the file. 370 355 // Created: 12/1/04 371 356 // 372 357 // -------------------------------------------------------------------------- 373 static void FindMostUsedSizes(BlocksAvailableEntry *pIndex, int64_t NumBlocks, int32_t Sizes[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES]) 358 static void SearchForMatchingBlocks(IOStream &rFile, std::map<int64_t, int64_t> &rFoundBlocks, 359 BlocksAvailableEntry *pIndex, int64_t NumBlocks, DiffTimer *pDiffTimer) 374 360 { 375 // Array for lengths 376 int64_t sizeCounts[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES]; 361 Timer maximumDiffingTime(0); 377 362 378 // Set arrays to lots of zeros (= unused entries) 379 for(int l = 0; l < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++l) 363 if(pDiffTimer && pDiffTimer->IsManaged()) 380 364 { 381 Sizes[l] = 0; 382 sizeCounts[l] = 0; 365 maximumDiffingTime = Timer(pDiffTimer->GetMaximumDiffingTime()); 383 366 } 384 367 385 // Array for collecting sizes 386 std::map<int32_t, int64_t> foundSizes; 368 // Flag to abort the run, if too many blocks are found or the diffing 369 // timeout expires 370 bool abortSearch = false; 371 372 // Buffers used during both phases of search 373 uint8_t *pbuffer0 = 0; 374 uint8_t *pbuffer1 = 0; 375 376 // Track offsets that already have block matches. We don't really care 377 // if its sorted, and this actually produces a performance issue, so 378 // see pfitBitmap for a workaround 379 std::map<int64_t, int32_t> goodnessOfFit; 387 380 388 // Run through blocks and make a count of the entries 389 for(int64_t b = 0; b < NumBlocks; ++b) 381 // Collect sizes that aren't found in the file at their old offset 382 std::map<int32_t, int64_t> unmatchedSizes; 383 384 // Our arrays of block sizes during second search pass 385 int32_t scanSizes[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES]; 386 int64_t scanSizesCount[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES]; 387 int32_t maxScanBlockSize = 0; 388 ::memset(scanSizes, 0, (sizeof(int32_t) * BACKUP_FILE_DIFF_MAX_BLOCK_SIZES)); 389 ::memset(scanSizesCount, 0, (sizeof(int64_t) * BACKUP_FILE_DIFF_MAX_BLOCK_SIZES)); 390 391 // We need to keep separate rolling checksums for each block size in second search 392 RollingChecksum *rollingSums[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES]; 393 ::memset(rollingSums, 0, (sizeof(RollingChecksum *) * BACKUP_FILE_DIFF_MAX_BLOCK_SIZES)); 394 395 // And a hash lookup table per block size in seacond search 396 BlocksAvailableEntry **hashTables[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES]; 397 ::memset(hashTables, 0, (sizeof(BlocksAvailableEntry **) * BACKUP_FILE_DIFF_MAX_BLOCK_SIZES)); 398 399 // We allow second search pass to short-circuit off rare blocks 400 bool scanThisSize[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES]; 401 ::memset(scanThisSize, 0, (sizeof(bool) * BACKUP_FILE_DIFF_MAX_BLOCK_SIZES)); 402 403 // During block read we have a bitmap of prefit locations to avoid std::map 404 // performance problems 405 uint8_t *pfitBitmap = 0; 406 407 408 // First search pass... 409 // 410 // For many files (especially large ones) most of the file is unchanged. 411 // The RollingChecksum process requires us to read every byte of the file looking for 412 // blocks that have moved. However, we can make that process more efficient by 413 // quickly rolling over areas that match a different block. We can also use 414 // this to eliminate entirely the rolling checksum for block sizes that only exist 415 // at one location in the file. 416 // 417 // Thus we start by looking for blocks that have not moved. Only if a block 418 // cannot be found at its previous location do we consider scanning for it by 419 // rolling checksum size. 420 // 421 // This strategy has some disadvantages for files with lots of repeating content 422 // that happens to align with our block size, but the reduction in diff time 423 // for more typical files is worth it. 424 // 425 // Note also that in this pass we consider _all_ block sizes (smaller than 426 // BACKUP_FILE_MAX_BLOCK_SIZE). Any block size, no matter how small or rare 427 // is cheap for us to find in this pass. 428 // 429 pbuffer0 = (uint8_t *)::malloc(BACKUP_FILE_MAX_BLOCK_SIZE); 430 try 390 431 { 391 // Only if the block size is bigger than the minimum size we'll scan for 392 if(pIndex[b].mSize > BACKUP_FILE_DIFF_MIN_BLOCK_SIZE) 432 if(pbuffer0 == 0) 433 { 434 throw std::bad_alloc(); 435 } 436 437 // We have to track file offset since the read may fail 438 int64_t fileOffset = 0; 439 440 // Walk the blocks 441 for(int64_t b = 0; b < NumBlocks; ++b) 393 442 { 394 // Find entry? 395 std::map<int32_t, int64_t>::const_iterator f(foundSizes.find(pIndex[b].mSize)); 396 if(f != foundSizes.end()) 443 // Check diffing timeout 444 if(maximumDiffingTime.HasExpired()) 445 { 446 ASSERT(pDiffTimer != NULL); 447 BOX_INFO("MaximumDiffingTime reached - suspending file diff"); 448 abortSearch = true; 449 break; 450 } 451 452 // Send keep alive 453 if(pDiffTimer) pDiffTimer->DoKeepAlive(); 454 455 // Skip blocks too large for our buffer 456 if(pIndex[b].mSize > BACKUP_FILE_MAX_BLOCK_SIZE) { 457 fileOffset += pIndex[b].mSize; 458 continue; 459 } 460 461 // Have to guard the seek operation, it could throw. We don't know size of the 462 // current file, and checking is pointless anyway since there's a race. 463 // In reality on Unix this is implemented with lseek which will mean we can 464 // seek past the EOF, but I don't want to make assumptions about Win32. 465 int32_t readSize = 0; 466 try 467 { 468 rFile.Seek(fileOffset, IOStream::SeekType_Absolute); 469 readSize = rFile.Read(pbuffer0, pIndex[b].mSize); 470 } 471 catch(BoxException &e) 472 { 473 if(e.GetType() != CommonException::ExceptionType || e.GetSubType() != CommonException::OSFileError) 474 { 475 // Not what we expected, rethrow 476 throw; 477 } 478 } 479 480 // Check for a match 481 bool blockMatched = false; 482 if(readSize == pIndex[b].mSize) 397 483 { 398 // Increment existing entry 399 foundSizes[pIndex[b].mSize] = foundSizes[pIndex[b].mSize] + 1; 484 // We don't have a rolling checksum to this point, so all we can do is MD5. If you 485 // worry this is expensive just remember that prior versions of this code 486 // re-read the file BACKUP_FILE_DIFF_MAX_BLOCK_SIZES times and calculated 487 // rolling checksums every time. 488 MD5Digest strong; 489 strong.Add(pbuffer0, pIndex[b].mSize); 490 strong.Finish(); 491 492 // Do we have a match? 493 if(strong.DigestMatches(pIndex[b].mStrongChecksum)) 494 { 495 #ifndef NDEBUG 496 BOX_TRACE("Found unchanged block of size " << pIndex[b].mSize << " at offset " << fileOffset); 497 #endif 498 rFoundBlocks[fileOffset] = b; 499 goodnessOfFit[fileOffset] = pIndex[b].mSize; 500 blockMatched = true; 501 } 400 502 } 401 else 503 504 // We're done with the file offset, so increment now 505 fileOffset += pIndex[b].mSize; 506 507 // If the block didn't match then this is a size we'll have to scan for 508 if(!blockMatched && (pIndex[b].mSize >= BACKUP_FILE_DIFF_MIN_BLOCK_SIZE)) 402 509 { 403 // New entry 404 foundSizes[pIndex[b].mSize] = 1; 510 // Find entry? 511 std::map<int32_t, int64_t>::const_iterator f(unmatchedSizes.find(pIndex[b].mSize)); 512 if(f != unmatchedSizes.end()) 513 { 514 // Increment existing entry 515 unmatchedSizes[pIndex[b].mSize] = unmatchedSizes[pIndex[b].mSize] + 1; 516 } 517 else 518 { 519 // New entry 520 unmatchedSizes[pIndex[b].mSize] = 1; 521 } 405 522 } 406 523 } 524 525 // Cleanup 526 ::free(pbuffer0); 527 pbuffer0 = 0; 407 528 } 408 409 // Make the block sizes 410 for(std::map<int32_t, int64_t>::const_iterator i(foundSizes.begin()); i != foundSizes.end(); ++i) 529 catch(...) 530 { 531 // Cleanup and rethrow 532 if(pbuffer0 != 0) ::free(pbuffer0); 533 throw; 534 } 535 536 // Are we already out of time? 537 if(abortSearch) 411 538 { 412 // Find the position of the size in the array 539 #ifndef NDEBUG 540 goto dumpDiffList; 541 #endif 542 return; 543 } 544 545 546 // Second search pass... 547 // 548 // In our second phase, having matched all unchanged blocks we now need 549 // to scan for moved blocks. This involves looping across all unmatched 550 // block sizes and using the rolling checksum to look for relocations. To keep 551 // this from being too expensive we cap at BACKUP_FILE_DIFF_MAX_BLOCK_SIZES 552 // for the number of sizes to scan. We also scan for the blocks in order of 553 // their relative probability, a block size that occurs frequently is scanned 554 // first. 555 // 556 557 // Loop all sizes inserting higher usages into the array 558 for(std::map<int32_t, int64_t>::const_iterator i(unmatchedSizes.begin()); i != unmatchedSizes.end(); ++i) 559 { 560 // TODO: Scanning for any block size isn't cheap, and realistically in many cases 561 // it would be less expensive to upload a few thousand bytes rather than do the scan. 562 // Here would be a good place to filter block sizes that aren't worth the effort 563 // once a suitable set of heuristics is found. 564 413 565 for(int t = 0; t < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++t) 414 566 { 415 // Instead of sorting on the raw count of blocks, 416 // take the file area covered by this block size. 417 if(i->second * i->first > sizeCounts[t] * Sizes[t]) 567 // Instead of sorting on the raw count of blocks, take the file area covered by this 568 // block size. This helps avoid favoring low numbers of large blocks over many 569 // small blocks. 570 if((i->second * i->first) > (scanSizesCount[t] * scanSizes[t])) 418 571 { 419 572 // Then this size belong before this entry -- shuffle them up 420 573 for(int s = (BACKUP_FILE_DIFF_MAX_BLOCK_SIZES - 1); s >= t; --s) 421 574 { 422 Sizes[s] =Sizes[s-1];423 s izeCounts[s] = sizeCounts[s-1];575 scanSizes[s] = scanSizes[s-1]; 576 scanSizesCount[s] = scanSizesCount[s-1]; 424 577 } 425 578 426 579 // Insert this size 427 Sizes[t] = i->first;428 s izeCounts[t] = i->second;580 scanSizes[t] = i->first; 581 scanSizesCount[t] = i->second; 429 582 430 // Shouldn't do any more searching 583 // Update max size 584 if(maxScanBlockSize < i->first) maxScanBlockSize = i->first; 585 586 // Shouldn't do any more searching for this size 431 587 break; 432 588 } 433 589 } 434 590 } 435 436 // trace the size table in debug builds 591 // Dump the size table in debug builds 437 592 #ifndef NDEBUG 438 if(BackupStoreFile::TraceDetailsOfDiffProcess)593 for(int t = 0; t < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++t) 439 594 { 440 for(int t = 0; t < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++t) 441 { 442 TRACE3("Diff block size %d: %d (count = %lld)\n", t, Sizes[t], sizeCounts[t]); 443 } 595 if(scanSizes[t] != 0) BOX_TRACE("Scan block size " << t << ": " << scanSizes[t] << " count: " << scanSizesCount[t]); 444 596 } 445 597 #endif 446 }447 448 449 450 // --------------------------------------------------------------------------451 //452 // Function453 // Name: static SearchForMatchingBlocks(IOStream &, std::map<int64_t, int64_t> &, BlocksAvailableEntry *, int64_t, int32_t[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES])454 // Purpose: Find the matching blocks within the file.455 // Created: 12/1/04456 //457 // --------------------------------------------------------------------------458 static void SearchForMatchingBlocks(IOStream &rFile, std::map<int64_t, int64_t> &rFoundBlocks,459 BlocksAvailableEntry *pIndex, int64_t NumBlocks,460 int32_t Sizes[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES], DiffTimer *pDiffTimer)461 {462 Timer maximumDiffingTime(0);463 598 464 if(pDiffTimer && pDiffTimer->IsManaged()) 599 // If we didn't find any sizes (could happen if all were found in the first matching 600 // phase) we're done. 601 if(maxScanBlockSize == 0) 465 602 { 466 maximumDiffingTime = Timer(pDiffTimer->GetMaximumDiffingTime()); 603 #ifndef NDEBUG 604 BOX_TRACE("No scan block sizes, skip rolling checksum"); 605 goto dumpDiffList; 606 #endif 607 return; 467 608 } 609 ASSERT(maxScanBlockSize <= BACKUP_FILE_MAX_BLOCK_SIZE); 468 610 469 std::map<int64_t, int32_t> goodnessOfFit; 470 471 // Allocate the hash lookup table 472 BlocksAvailableEntry **phashTable = (BlocksAvailableEntry **)::malloc(sizeof(BlocksAvailableEntry *) * (64*1024)); 473 474 // Choose a size for the buffer, just a little bit more than the maximum block size 475 int32_t bufSize = Sizes[0]; 476 for(int z = 1; z < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++z) 477 { 478 if(Sizes[z] > bufSize) bufSize = Sizes[z]; 479 } 480 bufSize += 4; 481 ASSERT(bufSize > Sizes[0]); 482 ASSERT(bufSize > 0); 483 if(bufSize > (BACKUP_FILE_MAX_BLOCK_SIZE + 1024)) 484 { 485 THROW_EXCEPTION(BackupStoreException, BadBackupStoreFile) 486 } 611 // Allocate two buffers we'll toggle between at the max scan block size 612 // There sizes are doubled to make final block at the end easier (we're cheating) 613 pbuffer0 = (uint8_t *)::malloc(maxScanBlockSize * 2); 614 pbuffer1 = (uint8_t *)::malloc(maxScanBlockSize * 2); 615 616 // Allocate a bitmap buffer to optimize goodnessOfFit access 617 pfitBitmap = (uint8_t *)::malloc(maxScanBlockSize * 2 / 8); 487 618 488 // TODO: Because we read in the file a scanned block size at a time,489 // it is likely to be inefficient. Probably will be much better to490 // calculate checksums for all block sizes in a single pass.491 492 // Allocate the buffers.493 uint8_t *pbuffer0 = (uint8_t *)::malloc(bufSize);494 uint8_t *pbuffer1 = (uint8_t *)::malloc(bufSize);495 619 try 496 620 { 497 621 // Check buffer allocation 498 if(pbuffer0 == 0 || pbuffer1 == 0 || p hashTable== 0)622 if(pbuffer0 == 0 || pbuffer1 == 0 || pfitBitmap == 0) 499 623 { 500 624 // If a buffer got allocated, it will be cleaned up in the catch block 501 625 throw std::bad_alloc(); 502 626 } 503 627 504 // Flag to abort the run, if too many blocks are found -- avoid using 505 // huge amounts of processor time when files contain many similar blocks. 506 bool abortSearch = false; 507 508 // Search for each block size in turn 509 // NOTE: Do the smallest size first, so that the scheme for adding 510 // entries in the found list works as expected and replaces smallers block 511 // with larger blocks when it finds matches at the same offset in the file. 512 for(int s = BACKUP_FILE_DIFF_MAX_BLOCK_SIZES - 1; s >= 0; --s) 513 { 514 ASSERT(Sizes[s] <= bufSize); 515 BOX_TRACE("Diff pass " << s << ", for block size " << 516 Sizes[s]); 517 518 // Check we haven't finished 519 if(Sizes[s] == 0) 520 { 521 // empty entry, try next size 522 continue; 523 } 524 525 // Set up the hash table entries 526 SetupHashTable(pIndex, NumBlocks, Sizes[s], phashTable); 628 // Shift file position back to beginning 629 int64_t bufferFileOffset = 0; 630 rFile.Seek(0, IOStream::SeekType_Absolute); 527 631 528 // Shift file position to beginning 529 rFile.Seek(0, IOStream::SeekType_Absolute); 530 531 // Read first block 532 if(rFile.Read(pbuffer0, Sizes[s]) != Sizes[s]) 632 // We're going to be flipping back and forth between two buffers, the low and high 633 uint8_t *lowBuffer = pbuffer0; 634 int32_t lowBufferBytes = 0; 635 uint8_t *highBuffer = pbuffer1; 636 int32_t highBufferBytes = 0; 637 638 // In some cases we need to carry over reads from prior buffers 639 int32_t carryOverBytes[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES]; 640 ::memset(carryOverBytes, 0, (sizeof(int32_t) * BACKUP_FILE_DIFF_MAX_BLOCK_SIZES)); 641 642 // Read the first buffer's worth of data 643 lowBufferBytes = rFile.Read(lowBuffer, maxScanBlockSize); 644 // Fill the second buffer if appropriate 645 if(lowBufferBytes == maxScanBlockSize) 646 { 647 highBufferBytes = rFile.Read(highBuffer, maxScanBlockSize); 648 } 649 650 // For every block size, initialize our scan tracking 651 for(int z = 0; z < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++z) 652 { 653 ASSERT(scanSizes[z] <= maxScanBlockSize); 654 655 // The sizes array may be mostly empty, in those cases we have no 656 // state to maintain. 657 if(scanSizes[z] != 0) 533 658 { 534 // Size of file too short to match -- do next size 535 continue; 536 } 537 538 // Setup block pointers 539 uint8_t *beginnings = pbuffer0; 540 uint8_t *endings = pbuffer1; 541 int offset = 0; 542 543 // Calculate the first checksum, ready for rolling 544 RollingChecksum rolling(beginnings, Sizes[s]); 545 546 // Then roll, until the file is exhausted 547 int64_t fileBlockNumber = 0; 548 int64_t fileOffset = 0; 549 int rollOverInitialBytes = 0; 550 while(true) 551 { 552 if(maximumDiffingTime.HasExpired()) 553 { 554 ASSERT(pDiffTimer != NULL); 555 BOX_INFO("MaximumDiffingTime reached - " 556 "suspending file diff"); 557 abortSearch = true; 558 break; 559 } 659 // Mark for scan 660 scanThisSize[z] = true; 560 661 561 if(pDiffTimer) 662 // Set up the hash table for this size 663 hashTables[z] = (BlocksAvailableEntry **)::malloc(sizeof(BlocksAvailableEntry *) * (64*1024)); 664 if(hashTables[z] == 0) 562 665 { 563 pDiffTimer->DoKeepAlive();666 throw std::bad_alloc(); 564 667 } 565 566 // Load in another block of data, and record how big it is 567 int bytesInEndings = rFile.Read(endings, Sizes[s]); 568 int tmp; 668 SetupHashTable(pIndex, NumBlocks, scanSizes[z], hashTables[z]); 569 669 570 // Skip any bytes from a previous matched block 571 if(rollOverInitialBytes > 0 && offset < bytesInEndings) 670 // Set up a rolling checksum, but only if there is enough data to start with 671 // (file may now be shorter than some block sizes previously used) 672 if(scanSizes[z] <= lowBufferBytes) 572 673 { 573 int spaceLeft = bytesInEndings - offset; 574 int thisRoll = (rollOverInitialBytes > spaceLeft) ? spaceLeft : rollOverInitialBytes; 674 rollingSums[z] = new RollingChecksum(lowBuffer, scanSizes[z]); 675 } 676 } 677 } 678 679 // Read loop while we can get full maxScanBlockSize reads 680 while(highBufferBytes == maxScanBlockSize) 681 { 682 // Oh happy day! We have maxScanBlockSize * 2 bytes available to us across 683 // the two buffers, which means we can walk every block size across all offsets 684 // in lowBuffer without any concern about overrunning the data available in highBuffer. 575 685 576 rolling.RollForwardSeveral(beginnings+offset, endings+offset, Sizes[s], thisRoll); 686 // Check diffing timeout 687 if(maximumDiffingTime.HasExpired()) 688 { 689 ASSERT(pDiffTimer != NULL); 690 BOX_INFO("MaximumDiffingTime reached - suspending file diff"); 691 abortSearch = true; 692 break; 693 } 694 695 // Send keep alive 696 if(pDiffTimer) pDiffTimer->DoKeepAlive(); 697 698 // Don't you wish hash_map was standard? std::map is very slow 699 // when we access it as often as we do in the loop below. To work around 700 // we'll create a bitmap of the previous fits in this buffer 701 ::memset(pfitBitmap, 0, maxScanBlockSize * 2 / 8); 702 for(std::map<int64_t, int32_t>::const_iterator i(goodnessOfFit.lower_bound(bufferFileOffset)); i != goodnessOfFit.end(); ++i) 703 { 704 if(i->first >= (bufferFileOffset + maxScanBlockSize)) break; 705 pfitBitmap[(i->first - bufferFileOffset) >> 3] |= (1 << ((i->first - bufferFileOffset) & 0x7)); 706 } 707 708 // Walk all block sizes in block-probability order 709 for(int s = 0; s < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++s) 710 { 711 // If there is no rolling checksum at this index, skip its either 712 // a zero size or the file was too small 713 if(!scanThisSize[s] || (rollingSums[s] == 0)) continue; 577 714 578 offset += thisRoll; 579 fileOffset += thisRoll; 580 rollOverInitialBytes -= thisRoll; 715 #ifndef NDEBUG 716 BOX_TRACE("Diff block size " << scanSizes[s] << " at file offset " << bufferFileOffset); 717 #endif 718 719 // Offset of this block buffer 720 int32_t bufferOffset = 0; 581 721 582 if(rollOverInitialBytes) 722 // Roll carry over 723 if(carryOverBytes[s] != 0) 724 { 725 ASSERT(carryOverBytes[s] > 0); 726 // Carry can be bigger than maxScanBlockSize because first matching 727 // phase might have used bigger blocks than the current max scan size 728 int32_t thisCarry = carryOverBytes[s]; 729 if(thisCarry > maxScanBlockSize) 583 730 { 584 goto refresh;731 thisCarry = maxScanBlockSize; 585 732 } 586 }587 588 if(goodnessOfFit.count(fileOffset))589 {590 tmp = goodnessOfFit[fileOffset];591 }592 else593 {594 tmp = 0;595 }596 733 597 if(tmp >= Sizes[s]) 598 { 599 // Skip over bigger ready-matched blocks completely 600 rollOverInitialBytes = tmp; 601 int spaceLeft = bytesInEndings - offset; 602 int thisRoll = (rollOverInitialBytes > spaceLeft) ? spaceLeft : rollOverInitialBytes; 603 604 rolling.RollForwardSeveral(beginnings+offset, endings+offset, Sizes[s], thisRoll); 605 606 offset += thisRoll; 607 fileOffset += thisRoll; 608 rollOverInitialBytes -= thisRoll; 609 610 if(rollOverInitialBytes) 734 // Perform carry 735 if((thisCarry + scanSizes[s]) > maxScanBlockSize) 736 { 737 // Roll is split some low, some low/high 738 int32_t lowRollBytes = maxScanBlockSize - scanSizes[s]; 739 ASSERT(lowRollBytes >= 0); 740 if(lowRollBytes > 0) 741 { 742 rollingSums[s]->RollForwardSeveral(lowBuffer, lowBuffer + scanSizes[s], scanSizes[s], lowRollBytes); 743 } 744 rollingSums[s]->RollForwardSeveral(lowBuffer + lowRollBytes, highBuffer, scanSizes[s], thisCarry - lowRollBytes); 745 } 746 else 611 747 { 612 goto refresh; 748 // Roll is all in low buffer 749 rollingSums[s]->RollForwardSeveral(lowBuffer, lowBuffer + scanSizes[s], scanSizes[s], thisCarry); 613 750 } 751 // Either way offset is carry 752 bufferOffset = thisCarry; 753 // Reuce carry by the carry amount actually carried 754 carryOverBytes[s] -= thisCarry; 755 ASSERT(carryOverBytes[s] >= 0); 614 756 } 615 616 while(offset < bytesInEndings) 757 758 // Loop remaining low buffer bytes, taking a checksum at each offset 759 while (bufferOffset < maxScanBlockSize) 617 760 { 618 // Is current checksum in hash list?619 uint16_t hash = rolling.GetComponentForHashing();620 if(p hashTable[hash] != 0 && (goodnessOfFit.count(fileOffset) == 0 || goodnessOfFit[fileOffset] < Sizes[s]))761 // Look for larger size matches at this offset 762 int32_t bestFitSoFarSize = 0; 763 if(pfitBitmap[bufferOffset >> 3] & (1 << (bufferOffset & 0x7))) 621 764 { 622 if(SecondStageMatch(phashTable[hash], rolling, beginnings, endings, offset, Sizes[s], fileBlockNumber, pIndex, rFoundBlocks)) 765 bestFitSoFarSize = goodnessOfFit[bufferFileOffset + bufferOffset]; 766 } 767 if(bestFitSoFarSize >= scanSizes[s]) { 768 // Roll can be bigger than maxScanBlockSize because first matching 769 // phase might have used bigger blocks than the max scan size 770 int32_t thisRoll = bestFitSoFarSize; 771 int32_t remainderRoll = 0; 772 if(thisRoll > maxScanBlockSize) 623 773 { 624 BOX_TRACE("Found block match for " << hash << " of " << Sizes[s] << " bytes at offset " << fileOffset); 625 goodnessOfFit[fileOffset] = Sizes[s]; 626 627 // Block matched, roll the checksum forward to the next block without doing 628 // any more comparisons, because these are pointless (as any more matches will be ignored when 629 // the receipe is generated) and just take up valuable processor time. Edge cases are 630 // especially nasty, using huge amounts of time and memory. 631 int skip = Sizes[s]; 632 if(offset < bytesInEndings && skip > 0) 633 { 634 int spaceLeft = bytesInEndings - offset; 635 int thisRoll = (skip > spaceLeft) ? spaceLeft : skip; 636 637 rolling.RollForwardSeveral(beginnings+offset, endings+offset, Sizes[s], thisRoll); 638 639 offset += thisRoll; 640 fileOffset += thisRoll; 641 skip -= thisRoll; 642 } 643 // Not all the bytes necessary will have been skipped, so get them 644 // skipped after the next block is loaded. 645 rollOverInitialBytes = skip; 646 647 // End this loop, so the final byte isn't used again 648 break; 774 thisRoll = maxScanBlockSize; 775 remainderRoll = bestFitSoFarSize - thisRoll; 649 776 } 650 else 777 778 // Roll forward in the lower buffer. This will either exhaust the total roll or will 779 // push the rolling checksum so that it is against the end of the buffer. 780 int32_t lowRoll = ((maxScanBlockSize - bufferOffset - scanSizes[s]) > thisRoll) ? thisRoll : (maxScanBlockSize - bufferOffset - scanSizes[s]); 781 if(lowRoll > 0) 651 782 { 652 BOX_TRACE("False alarm match for " << hash << " of " << Sizes[s] << " bytes at offset " << fileOffset); 783 rollingSums[s]->RollForwardSeveral(lowBuffer + bufferOffset, lowBuffer + bufferOffset + scanSizes[s], scanSizes[s], lowRoll); 784 bufferOffset += lowRoll; 785 thisRoll -= lowRoll; 653 786 } 654 655 int64_t NumBlocksFound = static_cast<int64_t>( 656 rFoundBlocks.size()); 657 int64_t MaxBlocksFound = NumBlocks * 658 BACKUP_FILE_DIFF_MAX_BLOCK_FIND_MULTIPLE; 659 660 if(NumBlocksFound > MaxBlocksFound) 787 if(thisRoll < 1) 661 788 { 662 abortSearch = true;663 break;789 ASSERT(remainderRoll == 0); 790 continue; // Back around to bufferOffset loop 664 791 } 792 // Roll high/low section. Our roll here will either exhaust the total or will 793 // leave the rolling checksum positioned a offset zero in the high buffer 794 int32_t lowHighRoll = ((maxScanBlockSize - bufferOffset) > thisRoll) ? thisRoll : (maxScanBlockSize - bufferOffset); 795 ASSERT(lowHighRoll > 0); 796 rollingSums[s]->RollForwardSeveral(lowBuffer + bufferOffset, highBuffer + (scanSizes[s] - (maxScanBlockSize - bufferOffset)), scanSizes[s], lowHighRoll); 797 bufferOffset += lowHighRoll; 798 carryOverBytes[s] = (thisRoll - lowHighRoll) + remainderRoll; 799 ASSERT(carryOverBytes[s] >= 0); 800 continue; // Back around to bufferOffset loop 665 801 } 666 802 667 // Roll checksum forward 668 rolling.RollForward(beginnings[offset], endings[offset], Sizes[s]); 669 670 // Increment offsets 671 ++offset; 672 ++fileOffset; 673 } 674 675 if(abortSearch) break; 676 677 refresh: 678 // Finished? 679 if(bytesInEndings != Sizes[s]) 680 { 681 // No more data in file -- check the final block 682 // (Do a copy and paste of 5 lines of code instead of introducing a comparison for 683 // each byte of the file) 684 uint16_t hash = rolling.GetComponentForHashing(); 685 if(phashTable[hash] != 0 && (goodnessOfFit.count(fileOffset) == 0 || goodnessOfFit[fileOffset] < Sizes[s])) 803 // Look for block match at this size 804 uint16_t hashValue = rollingSums[s]->GetComponentForHashing(); 805 if(hashTables[s][hashValue] != 0) 686 806 { 687 if(SecondStageMatch(phashTable[hash], rolling, beginnings, endings, offset, Sizes[s], fileBlockNumber, pIndex, rFoundBlocks)) 807 int32_t lowSize = maxScanBlockSize - bufferOffset; 808 if(lowSize > scanSizes[s]) lowSize = scanSizes[s]; 809 int32_t highSize = scanSizes[s] - lowSize; 810 if(SecondStageMatch(hashTables[s][hashValue], rollingSums[s], lowBuffer + bufferOffset, lowSize, 811 highBuffer, highSize, scanSizes[s], bufferFileOffset + bufferOffset, pIndex, rFoundBlocks)) 688 812 { 689 goodnessOfFit[fileOffset] = Sizes[s]; 813 #ifndef NDEBUG 814 BOX_TRACE("Found block match for hash " << hashValue << " of size " << scanSizes[s] << " at offset " << bufferFileOffset + bufferOffset); 815 #endif 816 // Update best fit so far 817 ASSERT((goodnessOfFit.count(bufferFileOffset + bufferOffset) == 0) || (goodnessOfFit[bufferFileOffset + bufferOffset] < scanSizes[s])); 818 goodnessOfFit[bufferFileOffset + bufferOffset] = scanSizes[s]; 819 pfitBitmap[bufferOffset >> 3] |= (1 << (bufferOffset & 0x7)); 820 821 // We've found a match, don't scan anymore if we don't expect more blocks of this size 822 --scanSizesCount[s]; 823 if(scanSizesCount[s] < 1) 824 { 825 scanThisSize[s] = false; 826 #ifndef NDEBUG 827 BOX_TRACE("Short-circuit size " << scanSizes[s]); 828 #endif 829 } 830 831 // Roll so we don't bother diffing the remainder of this block, see bestFitSoFarSize 832 // roll above for an explanation of the logic 833 int32_t thisRoll = scanSizes[s]; 834 int32_t remainderRoll = 0; 835 if(thisRoll > maxScanBlockSize) 836 { 837 thisRoll = maxScanBlockSize; 838 remainderRoll = bestFitSoFarSize - thisRoll; 839 } 840 int32_t lowRoll = ((maxScanBlockSize - bufferOffset - scanSizes[s]) > thisRoll) ? thisRoll : (maxScanBlockSize - bufferOffset - scanSizes[s]); 841 if(lowRoll > 0) 842 { 843 rollingSums[s]->RollForwardSeveral(lowBuffer + bufferOffset, lowBuffer + bufferOffset + scanSizes[s], scanSizes[s], lowRoll); 844 bufferOffset += lowRoll; 845 thisRoll -= lowRoll; 846 } 847 if(thisRoll < 1) 848 { 849 ASSERT(remainderRoll == 0); 850 continue; // Back around to bufferOffset loop 851 } 852 int32_t lowHighRoll = ((maxScanBlockSize - bufferOffset) > thisRoll) ? thisRoll : (maxScanBlockSize - bufferOffset); 853 ASSERT(lowHighRoll > 0); 854 rollingSums[s]->RollForwardSeveral(lowBuffer + bufferOffset, highBuffer + (scanSizes[s] - (maxScanBlockSize - bufferOffset)), scanSizes[s], lowHighRoll); 855 bufferOffset += lowHighRoll; 856 carryOverBytes[s] = (thisRoll - lowHighRoll) + remainderRoll; 857 ASSERT(carryOverBytes[s] >= 0); 858 continue; // Back around to bufferOffset loop 690 859 } 691 860 } 861 862 // Roll one byte, either low or low/high depending on current offset 863 if((bufferOffset + scanSizes[s]) >= maxScanBlockSize) 864 { 865 rollingSums[s]->RollForward(lowBuffer[bufferOffset], highBuffer[scanSizes[s] - (maxScanBlockSize - bufferOffset)], scanSizes[s]); 866 } 867 else 868 { 869 rollingSums[s]->RollForward(lowBuffer[bufferOffset], lowBuffer[bufferOffset + scanSizes[s]], scanSizes[s]); 870 } 871 ++bufferOffset; 872 } 692 873 693 // finish 874 // Check if we've found too much 875 int64_t NumBlocksFound = static_cast<int64_t>(rFoundBlocks.size()); 876 int64_t MaxBlocksFound = NumBlocks * BACKUP_FILE_DIFF_MAX_BLOCK_FIND_MULTIPLE; 877 if(NumBlocksFound > MaxBlocksFound) 878 { 879 abortSearch = true; 694 880 break; 695 881 } 696 697 // Switch buffers, reset offset 698 beginnings = endings; 699 endings = (beginnings == pbuffer0)?(pbuffer1):(pbuffer0); // ie the other buffer 700 offset = 0; 882 } 883 // If we're aborting, pass through 884 if(abortSearch) break; 885 886 // Swap buffers so high is low and the go back around 887 uint8_t *swap = lowBuffer; 888 lowBuffer = highBuffer; 889 ASSERT((lowBufferBytes == highBufferBytes) && (lowBufferBytes == maxScanBlockSize)); 890 bufferFileOffset += maxScanBlockSize; 891 highBuffer = swap; 892 highBufferBytes = rFile.Read(highBuffer, maxScanBlockSize); 701 893 702 // And count the blocks which have been done 703 ++fileBlockNumber; 894 } // End of maxScanBlockSize reads 895 896 // What remains is either: 897 // - An abort or timeout 898 // - A low buffer with less than maxScanBlockSize bytes and an empty high buffer 899 // (Potentially zero low bytes for a zero-length file) 900 // - A maxScanBlockSize low buffer and a maxScanBlockSize or less high buffer 901 // 902 if(!abortSearch && !maximumDiffingTime.HasExpired() && (lowBufferBytes > 0)) 903 { 904 // We're going to repeat the logic in the previous read loop, but have 905 // to be careful not to walk off the end of the available data. To make this 906 // a lot less convoluted, we're just going to copy to consolidate buffers. 907 // This is cheating a bit, but worth the simplicity. 908 ASSERT((lowBufferBytes < maxScanBlockSize) ? (highBufferBytes == 0) : true); 909 ASSERT((highBufferBytes != 0) ? (lowBufferBytes == maxScanBlockSize) : true); 910 uint8_t *endingBuffer = lowBuffer; 911 if(highBufferBytes != 0) 912 { 913 ::memcpy(endingBuffer + maxScanBlockSize, highBuffer, highBufferBytes); 914 } 915 int32_t endingBytes = lowBufferBytes + highBufferBytes; 916 917 // Send keep alive 918 if(pDiffTimer) pDiffTimer->DoKeepAlive(); 919 920 // std::map still not winning 921 ::memset(pfitBitmap, 0, maxScanBlockSize * 2 / 8); 922 for(std::map<int64_t, int32_t>::const_iterator i(goodnessOfFit.lower_bound(bufferFileOffset)); i != goodnessOfFit.end(); ++i) 923 { 924 pfitBitmap[(i->first - bufferFileOffset) >> 3] |= (1 << ((i->first - bufferFileOffset) & 0x7)); 704 925 } 926 927 // Walk all block sizes in block-probability order 928 for(int s = 0; s < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++s) 929 { 930 // If there is no rolling checksum at this index, skip its either 931 // a zero size or the file was too small 932 if(!scanThisSize[s] || (rollingSums[s] == 0)) continue; 933 934 // If there's not enough data at the end, skip this block size 935 if(endingBytes < scanSizes[s]) continue; 705 936 706 if(abortSearch) break; 937 #ifndef NDEBUG 938 BOX_TRACE("Diff block size " << scanSizes[s] << " at file ending offset " << bufferFileOffset); 939 #endif 940 941 // Offset of this block buffer 942 int32_t bufferOffset = 0; 943 944 // For each block size we can only walk so far safely 945 int32_t maxSafeOffset = endingBytes - scanSizes[s]; 946 947 // Roll carry over 948 if(carryOverBytes[s] != 0) 949 { 950 ASSERT(carryOverBytes[s] > 0); 951 // Don't walk off the end 952 if(carryOverBytes[s] > maxSafeOffset) continue; 953 // Roll 954 rollingSums[s]->RollForwardSeveral(endingBuffer, endingBuffer + scanSizes[s], scanSizes[s], carryOverBytes[s]); 955 bufferOffset = carryOverBytes[s]; 956 carryOverBytes[s] = 0; // Not really needed again, but be clean 957 } 958 959 // Loop remaining end bytes, we must walk to exactly and then break the loop 960 while (bufferOffset <= maxSafeOffset) 961 { 962 // Look for larger size matches at this offset 963 int32_t bestFitSoFarSize = 0; 964 if(pfitBitmap[bufferOffset >> 3] & (1 << (bufferOffset & 0x7))) 965 { 966 bestFitSoFarSize = goodnessOfFit[bufferFileOffset + bufferOffset]; 967 } 968 if(bestFitSoFarSize >= scanSizes[s]) { 969 // Space for this roll? 970 if((bufferOffset + bestFitSoFarSize) > maxSafeOffset) break; // Kill byte walk loop 971 // Roll 972 rollingSums[s]->RollForwardSeveral(endingBuffer + bufferOffset, endingBuffer + bufferOffset + scanSizes[s], scanSizes[s], bestFitSoFarSize); 973 bufferOffset += bestFitSoFarSize; 974 continue; // Back around to byte read loop 975 } 976 977 // Look for block match at this size 978 uint16_t hashValue = rollingSums[s]->GetComponentForHashing(); 979 if(hashTables[s][hashValue] != 0) 980 { 981 if(SecondStageMatch(hashTables[s][hashValue], rollingSums[s], endingBuffer + bufferOffset, scanSizes[s], 982 0, 0, scanSizes[s], bufferFileOffset + bufferOffset, pIndex, rFoundBlocks)) 983 { 984 #ifndef NDEBUG 985 BOX_TRACE("Found block match for hash " << hashValue << " of size " << scanSizes[s] << " at offset " << bufferFileOffset + bufferOffset); 986 #endif 987 // Update best fit so far 988 ASSERT((goodnessOfFit.count(bufferFileOffset + bufferOffset) == 0) || (goodnessOfFit[bufferFileOffset + bufferOffset] < scanSizes[s])); 989 goodnessOfFit[bufferFileOffset + bufferOffset] = scanSizes[s]; 990 pfitBitmap[bufferOffset >> 3] |= (1 << (bufferOffset & 0x7)); 991 992 // We've found a match, don't scan anymore if we don't expect more blocks of this size 993 --scanSizesCount[s]; 994 if(scanSizesCount[s] < 1) 995 { 996 scanThisSize[s] = false; 997 #ifndef NDEBUG 998 BOX_TRACE("Short-circuit size " << scanSizes[s]); 999 #endif 1000 } 1001 1002 // Roll over this matched block 1003 if((bufferOffset + scanSizes[s]) > maxSafeOffset) break; // Kill byte walk loop 1004 rollingSums[s]->RollForwardSeveral(endingBuffer + bufferOffset, endingBuffer + bufferOffset + scanSizes[s], scanSizes[s], scanSizes[s]); 1005 bufferOffset += scanSizes[s]; 1006 continue; // Back around to byte read loop 1007 } 1008 } 1009 1010 // Abort check 1011 int64_t NumBlocksFound = static_cast<int64_t>(rFoundBlocks.size()); 1012 int64_t MaxBlocksFound = NumBlocks * BACKUP_FILE_DIFF_MAX_BLOCK_FIND_MULTIPLE; 1013 if(NumBlocksFound > MaxBlocksFound) 1014 { 1015 abortSearch = true; 1016 break; 1017 } 1018 1019 // Because byte loop conditional can equal maxSafeOffset we must not read off 1020 // end here 1021 if(bufferOffset == maxSafeOffset) break; // Done with reading bytes 1022 rollingSums[s]->RollForward(endingBuffer[bufferOffset], endingBuffer[bufferOffset + scanSizes[s]], scanSizes[s]); 1023 ++bufferOffset; 1024 } 1025 if(abortSearch) break; 1026 // NOTE: Nothing else can go here without restructuring use of "break" in the byte read loop above 1027 } 707 1028 } 708 1029 709 // Free buffers and hash table 710 ::free(pbuffer1); 711 pbuffer1 = 0; 1030 // Clean up 712 1031 ::free(pbuffer0); 713 1032 pbuffer0 = 0; 714 ::free(phashTable); 715 phashTable = 0; 1033 ::free(pbuffer1); 1034 pbuffer1 = 0; 1035 ::free(pfitBitmap); 1036 pfitBitmap = 0; 1037 for(int z = 0; z < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++z) 1038 { 1039 if(hashTables[z] != 0) 1040 { 1041 free(hashTables[z]); 1042 hashTables[z] = 0; 1043 } 1044 if(rollingSums[z] != 0) 1045 { 1046 delete rollingSums[z]; 1047 rollingSums[z] = 0; 1048 } 1049 } 716 1050 } 717 1051 catch(...) 718 1052 { 719 // Cleanup and throw 720 if(pbuffer1 != 0) ::free(pbuffer1); 1053 // Cleanup and rethrow 721 1054 if(pbuffer0 != 0) ::free(pbuffer0); 722 if(phashTable != 0) ::free(phashTable); 1055 if(pbuffer1 != 0) ::free(pbuffer1); 1056 if(pfitBitmap != 0) ::free(pfitBitmap); 1057 for(int z = 0; z < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++z) 1058 { 1059 if(hashTables[z] != 0) free(hashTables[z]); 1060 } 1061 for(int z = 0; z < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++z) 1062 { 1063 if(rollingSums[z] != 0) delete rollingSums[z]; 1064 } 723 1065 throw; 724 1066 } 725 1067 726 1068 #ifndef NDEBUG 727 if(BackupStoreFile::TraceDetailsOfDiffProcess) 728 { 729 // Trace out the found blocks in debug mode 730 BOX_TRACE("Diff: list of found blocks"); 731 BOX_TRACE("======== ======== ======== ========"); 732 BOX_TRACE(" Offset BlkIdx Size Movement"); 733 for(std::map<int64_t, int64_t>::const_iterator i(rFoundBlocks.begin()); i != rFoundBlocks.end(); ++i) 734 { 735 int64_t orgLoc = 0; 736 for(int64_t b = 0; b < i->second; ++b) 737 { 738 orgLoc += pIndex[b].mSize; 739 } 740 BOX_TRACE(std::setw(8) << i->first << " " << 741 std::setw(8) << i->second << " " << 742 std::setw(8) << pIndex[i->second].mSize << 743 " " << 744 std::setw(8) << (i->first - orgLoc)); 745 } 746 BOX_TRACE("======== ======== ======== ========"); 1069 dumpDiffList: 1070 // Trace out the found blocks in debug mode 1071 BOX_TRACE("Diff: list of found blocks"); 1072 BOX_TRACE("======== ======== ======== ========"); 1073 BOX_TRACE(" Offset BlkIdx Size Movement"); 1074 for(std::map<int64_t, int64_t>::const_iterator i(rFoundBlocks.begin()); i != rFoundBlocks.end(); ++i) 1075 { 1076 int64_t orgLoc = 0; 1077 for(int64_t b = 0; b < i->second; ++b) 1078 { 1079 orgLoc += pIndex[b].mSize; 1080 } 1081 BOX_TRACE(std::setw(8) << i->first << " " << 1082 std::setw(8) << i->second << " " << 1083 std::setw(8) << pIndex[i->second].mSize << 1084 " " << 1085 std::setw(8) << (i->first - orgLoc)); 747 1086 } 1087 BOX_TRACE("======== ======== ======== ========"); 748 1088 #endif 749 1089 } 750 1090 … … 776 1116 { 777 1117 //TRACE1("Another hash entry for %d found\n", hash); 778 1118 // Yes -- need to set the pointer in this entry to the current entry to build the linked list 1119 // This is safe because there is only one hash table per block size and block sizes 1120 // are unique. 779 1121 pIndex[b].mpNextInHashList = pHashTable[hash]; 780 1122 } 781 1123 … … 785 1127 } 786 1128 } 787 1129 788 789 1130 // -------------------------------------------------------------------------- 790 1131 // 791 1132 // Function … … 794 1135 // Created: 14/1/04 795 1136 // 796 1137 // -------------------------------------------------------------------------- 797 static bool SecondStageMatch(BlocksAvailableEntry *pFirstInHashList, RollingChecksum &fastSum, uint8_t *pBeginnings, uint8_t *pEndings, 798 int Offset, int32_t BlockSize, int64_t FileBlockNumber, BlocksAvailableEntry *pIndex, std::map<int64_t, int64_t> &rFoundBlocks) 1138 static bool SecondStageMatch(BlocksAvailableEntry *pFirstInHashList, RollingChecksum *pFastSum, 1139 uint8_t *pLow, int32_t LowSize, uint8_t *pHigh, int32_t HighSize, int32_t BlockSize, int64_t FileOffset, 1140 BlocksAvailableEntry *pIndex, std::map<int64_t, int64_t> &rFoundBlocks) 799 1141 { 800 1142 // Check parameters 801 ASSERT(pBeginnings != 0);802 ASSERT(pEndings != 0);803 ASSERT(Offset >= 0);804 ASSERT(BlockSize > 0);805 1143 ASSERT(pFirstInHashList != 0); 1144 ASSERT(pFastSum != 0); 1145 ASSERT(pLow != 0); 1146 ASSERT(LowSize != 0); 1147 ASSERT((HighSize > 0) ? (pHigh != 0) : true); 1148 ASSERT(BlockSize > 0); 1149 ASSERT(BlockSize == (LowSize + HighSize)); 806 1150 ASSERT(pIndex != 0); 807 1151 808 1152 #ifndef NDEBUG 809 uint16_t DEBUG_Hash = fastSum.GetComponentForHashing();1153 uint16_t DEBUG_Hash = pFastSum->GetComponentForHashing(); 810 1154 #endif 811 uint32_t Checksum = fastSum.GetChecksum();1155 uint32_t Checksum = pFastSum->GetChecksum(); 812 1156 813 1157 // Before we go to the expense of the MD5, make sure it's a darn good match on the checksum we already know. 814 1158 BlocksAvailableEntry *scan = pFirstInHashList; … … 829 1173 830 1174 // Calculate the strong MD5 digest for this block 831 1175 MD5Digest strong; 832 // Add the data from the beginnings 833 strong.Add(pBeginnings + Offset, BlockSize - Offset); 834 // Add any data from the endings 835 if(Offset > 0) 836 { 837 strong.Add(pEndings, Offset); 838 } 1176 strong.Add(pLow, LowSize); 1177 if(HighSize > 0) strong.Add(pHigh, HighSize); 839 1178 strong.Finish(); 840 1179 841 1180 // Then go through the entries in the hash list, comparing with the strong digest calculated 842 1181 scan = pFirstInHashList; 843 //TRACE0("second stage match\n");844 1182 while(scan != 0) 845 1183 { 846 1184 //TRACE3("scan size %d, block size %d, hash %d\n", scan->mSize, BlockSize, Hash); 847 1185 ASSERT(scan->mSize == BlockSize); 848 1186 ASSERT(RollingChecksum::ExtractHashingComponent(scan->mWeakChecksum) == DEBUG_Hash); 849 1187 850 1188 // Compare? 851 1189 if(strong.DigestMatches(scan->mStrongChecksum)) 852 1190 { 853 //TRACE0("Match!\n");854 1191 // Found! Add to list of found blocks... 855 int64_t fileOffset = (FileBlockNumber * BlockSize) + Offset;856 1192 int64_t blockIndex = (scan - pIndex); // pointer arthmitic is frowned upon. But most efficient way of doing it here -- alternative is to use more memory 857 1193 858 1194 // We do NOT search for smallest blocks first, as this code originally assumed. 859 1195 // To prevent this from potentially overwriting a better match, the caller must determine 860 1196 // the relative "goodness" of any existing match and this one, and avoid the call if it 861 1197 // could be detrimental. 862 rFoundBlocks[ fileOffset] = blockIndex;1198 rFoundBlocks[FileOffset] = blockIndex; 863 1199 864 1200 // No point in searching further, report success 865 1201 return true; 866 1202 } 867 868 1203 // Next 869 1204 scan = scan->mpNextInHashList; 870 1205 } … … 909 1244 instruction.mSpaceBefore = SizeOfInputFile; 910 1245 rRecipe.push_back(instruction); 911 1246 912 #ifndef NDEBUG 913 if(BackupStoreFile::TraceDetailsOfDiffProcess) 914 { 915 TRACE1("Diff: Default recipe generated, %lld bytes of file\n", SizeOfInputFile); 916 } 917 #endif 1247 #ifndef NDEBUG 1248 TRACE1("Diff: Default recipe generated, %lld bytes of file\n", SizeOfInputFile); 1249 #endif 918 1250 919 1251 // Don't do anything 920 1252 return; … … 1006 1338 // dump out the recipe 1007 1339 #ifndef NDEBUG 1008 1340 TRACE2("Diff: %lld new bytes found, %lld old blocks used\n", debug_NewBytesFound, debug_OldBlocksUsed); 1009 if(BackupStoreFile::TraceDetailsOfDiffProcess)1341 TRACE1("Diff: Recipe generated (size %d)\n======== ========= ========\nSpace b4 FirstBlk NumBlks\n", rRecipe.size()); 1010 1342 { 1011 TRACE1("Diff: Recipe generated (size %d)\n======== ========= ========\nSpace b4 FirstBlk NumBlks\n", rRecipe.size());1343 for(unsigned int e = 0; e < rRecipe.size(); ++e) 1012 1344 { 1013 for(unsigned int e = 0; e < rRecipe.size(); ++e) 1014 { 1015 char b[64]; 1345 char b[64]; 1016 1346 #ifdef WIN32 1017 sprintf(b, "%8I64d", (int64_t)(rRecipe[e].mpStartBlock - pIndex));1347 sprintf(b, "%8I64d", (int64_t)(rRecipe[e].mpStartBlock - pIndex)); 1018 1348 #else 1019 sprintf(b, "%8lld", (int64_t)(rRecipe[e].mpStartBlock - pIndex));1349 sprintf(b, "%8lld", (int64_t)(rRecipe[e].mpStartBlock - pIndex)); 1020 1350 #endif 1021 TRACE3("%8lld %s %8lld\n", rRecipe[e].mSpaceBefore, (rRecipe[e].mpStartBlock == 0)?" -":b, (int64_t)rRecipe[e].mBlocks); 1022 } 1351 TRACE3("%8lld %s %8lld\n", rRecipe[e].mSpaceBefore, (rRecipe[e].mpStartBlock == 0)?" -":b, (int64_t)rRecipe[e].mBlocks); 1023 1352 } 1024 TRACE0("======== ========= ========\n");1025 1353 } 1354 TRACE0("======== ========= ========\n"); 1026 1355 #endif 1027 1356 }
