Ticket #45: BackupFileDiff.2361.patch
| File BackupFileDiff.2361.patch, 48.3 KB (added by chris, 3 years ago) |
|---|
-
lib/backupclient/BackupStoreFile.h
214 214   static void ResetStats();
215 215   static BackupStoreFileStats msStats;
216 216
217     - // For debug
218     - #ifndef NDEBUG
219     - static bool TraceDetailsOfDiffProcess;
220     - #endif
221     -
222 217   // For decoding encoded files
223 218   static void DumpFile(void *clibFileHandle, bool ToTrace, IOStream &rFile);
224 219   };
-
test/backupdiff/testbackupdiff.cpp
390 390   // Want to trace out all the details
391 391   #ifndef NDEBUG
392 392   #ifndef WIN32
393     - BackupStoreFile::TraceDetailsOfDiffProcess = true;
    393 + Logging::SetGlobalLevel(Log::TRACE);
394 394   #endif
395 395   #endif
396 396
-
lib/backupclient/BackupStoreFileDiff.cpp
38 38 using namespace BackupStoreFileCryptVar; 39 39 using namespace BackupStoreFileCreation; 40 40 41 // By default, don't trace out details of the diff as we go along -- would fill up logs significantly. 42 // But it's useful for the test. 43 #ifndef NDEBUG 44 bool BackupStoreFile::TraceDetailsOfDiffProcess = false; 45 #endif 46 47 static void LoadIndex(IOStream &rBlockIndex, int64_t ThisID, BlocksAvailableEntry **ppIndex, int64_t &rNumBlocksOut, int Timeout, bool &rCanDiffFromThis); 48 static void FindMostUsedSizes(BlocksAvailableEntry *pIndex, int64_t NumBlocks, int32_t Sizes[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES]); 49 static void SearchForMatchingBlocks(IOStream &rFile, 50 std::map<int64_t, int64_t> &rFoundBlocks, BlocksAvailableEntry *pIndex, 51 int64_t NumBlocks, int32_t Sizes[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES], 41 static void LoadIndex(IOStream &rBlockIndex, int64_t ThisID, 42 BlocksAvailableEntry **ppIndex, int64_t &rNumBlocksOut, 43 int Timeout, bool &rCanDiffFromThis); 44 static void SearchForMatchingBlocks(IOStream &rFile, 45 std::map<int64_t, int64_t> &rFoundBlocks, 46 BlocksAvailableEntry *pIndex, int64_t NumBlocks, 52 47 DiffTimer *pDiffTimer); 53 static void SetupHashTable(BlocksAvailableEntry *pIndex, int64_t NumBlocks, int32_t BlockSize, BlocksAvailableEntry **pHashTable); 54 static bool SecondStageMatch(BlocksAvailableEntry *pFirstInHashList, RollingChecksum &fastSum, uint8_t *pBeginnings, uint8_t *pEndings, int Offset, int32_t BlockSize, int64_t FileBlockNumber, 55 BlocksAvailableEntry *pIndex, std::map<int64_t, int64_t> &rFoundBlocks); 56 static void GenerateRecipe(BackupStoreFileEncodeStream::Recipe &rRecipe, BlocksAvailableEntry *pIndex, int64_t NumBlocks, std::map<int64_t, int64_t> &rFoundBlocks, int64_t SizeOfInputFile); 48 static void SetupHashTable(BlocksAvailableEntry *pIndex, 49 int64_t NumBlocks, int32_t BlockSize, 50 BlocksAvailableEntry **pHashTable); 51 static bool SecondStageMatch(BlocksAvailableEntry *pFirstInHashList, 52 
RollingChecksum *pFastSum, 53 uint8_t *pLow, int32_t LowSize, 54 uint8_t *pHigh, int32_t HighSize, 55 int32_t BlockSize, int64_t FileOffset, 56 BlocksAvailableEntry *pIndex, 57 std::map<int64_t, int64_t> &rFoundBlocks); 58 static void GenerateRecipe(BackupStoreFileEncodeStream::Recipe &rRecipe, 59 BlocksAvailableEntry *pIndex, int64_t NumBlocks, 60 std::map<int64_t, int64_t> &rFoundBlocks, int64_t SizeOfInputFile); 57 61 58 62 // -------------------------------------------------------------------------- 59 63 // … … 168 172 169 173 try 170 174 { 171 // Find which sizes should be scanned172 int32_t sizesToScan[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES];173 FindMostUsedSizes(pindex, blocksInIndex, sizesToScan);174 175 175 // Flag for reporting to the user 176 176 bool completelyDifferent; 177 177 … … 186 186 // Get size of file 187 187 sizeOfInputFile = file.BytesLeftToRead(); 188 188 // Find all those lovely matching blocks 189 SearchForMatchingBlocks(file, foundBlocks, pindex, 190 blocksInIndex, sizesToScan, pDiffTimer); 189 SearchForMatchingBlocks(file, foundBlocks, pindex, blocksInIndex, pDiffTimer); 191 190 192 191 // Is it completely different? 193 192 completelyDifferent = (foundBlocks.size() == 0); … … 363 362 } 364 363 } 365 364 366 367 365 // -------------------------------------------------------------------------- 368 366 // 369 367 // Function 370 // Name: static FindMostUsedSizes(BlocksAvailableEntry *, int64_t, int32_t[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES]) 371 // Purpose: Finds the most commonly used block sizes in the index 368 // Name: static void SearchForMatchingBlocks(IOStream &, 369 // std::map<int64_t, int64_t> &, 370 // BlocksAvailableEntry *, int64_t, DiffTimer *) 371 // Purpose: Find the matching blocks within the file. 
372 372 // Created: 12/1/04 373 373 // 374 374 // -------------------------------------------------------------------------- 375 static void FindMostUsedSizes(BlocksAvailableEntry *pIndex, int64_t NumBlocks, int32_t Sizes[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES]) 375 static void SearchForMatchingBlocks(IOStream &rFile, 376 std::map<int64_t, int64_t> &rFoundBlocks, 377 BlocksAvailableEntry *pIndex, int64_t NumBlocks, DiffTimer *pDiffTimer) 376 378 { 377 // Array for lengths 378 int64_t sizeCounts[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES]; 379 Timer maximumDiffingTime(0, "MaximumDiffingTime"); 379 380 380 // Set arrays to lots of zeros (= unused entries) 381 for(int l = 0; l < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++l) 381 if(pDiffTimer && pDiffTimer->IsManaged()) 382 382 { 383 Sizes[l] = 0;384 sizeCounts[l] = 0;383 maximumDiffingTime = Timer(pDiffTimer->GetMaximumDiffingTime(), 384 "MaximumDiffingTime"); 385 385 } 386 386 387 // Array for collecting sizes 388 std::map<int32_t, int64_t> foundSizes; 387 // Flag to abort the run, if too many blocks are found or the diffing 388 // timeout expires 389 bool abortSearch = false; 389 390 390 // Run through blocks and make a count of the entries 391 for(int64_t b = 0; b < NumBlocks; ++b) 391 // Buffers used during both phases of search 392 uint8_t *pbuffer0 = 0; 393 uint8_t *pbuffer1 = 0; 394 395 // Track offsets that already have block matches. 
We don't really care 396 // if its sorted, and this actually produces a performance issue, so 397 // see pfitBitmap for a workaround 398 std::map<int64_t, int32_t> goodnessOfFit; 399 400 // Collect sizes that aren't found in the file at their old offset 401 std::map<int32_t, int64_t> unmatchedSizes; 402 403 // Our arrays of block sizes during second search pass 404 int32_t scanSizes[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES]; 405 int64_t scanSizesCount[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES]; 406 int32_t maxScanBlockSize = 0; 407 ::memset(scanSizes, 0, (sizeof(int32_t) * BACKUP_FILE_DIFF_MAX_BLOCK_SIZES)); 408 ::memset(scanSizesCount, 0, (sizeof(int64_t) * BACKUP_FILE_DIFF_MAX_BLOCK_SIZES)); 409 410 // We need to keep separate rolling checksums for each block size in second search 411 RollingChecksum *rollingSums[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES]; 412 ::memset(rollingSums, 0, (sizeof(RollingChecksum *) * BACKUP_FILE_DIFF_MAX_BLOCK_SIZES)); 413 414 // And a hash lookup table per block size in seacond search 415 BlocksAvailableEntry **hashTables[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES]; 416 ::memset(hashTables, 0, (sizeof(BlocksAvailableEntry **) * BACKUP_FILE_DIFF_MAX_BLOCK_SIZES)); 417 418 // We allow second search pass to short-circuit off rare blocks 419 bool scanThisSize[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES]; 420 ::memset(scanThisSize, 0, (sizeof(bool) * BACKUP_FILE_DIFF_MAX_BLOCK_SIZES)); 421 422 // During block read we have a bitmap of prefit locations to avoid std::map 423 // performance problems 424 uint8_t *pfitBitmap = 0; 425 426 427 // First search pass... 428 // 429 // For many files (especially large ones) most of the file is unchanged. 430 // The RollingChecksum process requires us to read every byte of the file looking for 431 // blocks that have moved. However, we can make that process more efficient by 432 // quickly rolling over areas that match a different block. 
We can also use 433 // this to eliminate entirely the rolling checksum for block sizes that only exist 434 // at one location in the file. 435 // 436 // Thus we start by looking for blocks that have not moved. Only if a block 437 // cannot be found at its previous location do we consider scanning for it by 438 // rolling checksum size. 439 // 440 // This strategy has some disadvantages for files with lots of repeating content 441 // that happens to align with our block size, but the reduction in diff time 442 // for more typical files is worth it. 443 // 444 // Note also that in this pass we consider _all_ block sizes (smaller than 445 // BACKUP_FILE_MAX_BLOCK_SIZE). Any block size, no matter how small or rare 446 // is cheap for us to find in this pass. 447 // 448 pbuffer0 = (uint8_t *)::malloc(BACKUP_FILE_MAX_BLOCK_SIZE); 449 try 392 450 { 393 // Only if the block size is bigger than the minimum size we'll scan for 394 if(pIndex[b].mSize > BACKUP_FILE_DIFF_MIN_BLOCK_SIZE) 451 if(pbuffer0 == 0) 395 452 { 396 // Find entry? 397 std::map<int32_t, int64_t>::const_iterator f(foundSizes.find(pIndex[b].mSize)); 398 if(f != foundSizes.end()) 453 throw std::bad_alloc(); 454 } 455 456 // We have to track file offset since the read may fail 457 int64_t fileOffset = 0; 458 459 // Walk the blocks 460 for(int64_t b = 0; b < NumBlocks; ++b) 461 { 462 // Check diffing timeout 463 if(maximumDiffingTime.HasExpired()) 399 464 { 400 // Increment existing entry 401 foundSizes[pIndex[b].mSize] = foundSizes[pIndex[b].mSize] + 1; 465 ASSERT(pDiffTimer != NULL); 466 BOX_INFO("MaximumDiffingTime reached - suspending file diff"); 467 abortSearch = true; 468 break; 402 469 } 403 else 470 471 // Send keep alive 472 if(pDiffTimer) pDiffTimer->DoKeepAlive(); 473 474 // Skip blocks too large for our buffer 475 if(pIndex[b].mSize > BACKUP_FILE_MAX_BLOCK_SIZE) { 476 fileOffset += pIndex[b].mSize; 477 continue; 478 } 479 480 // Have to guard the seek operation, it could throw. 
We don't know size of the 481 // current file, and checking is pointless anyway since there's a race. 482 // In reality on Unix this is implemented with lseek which will mean we can 483 // seek past the EOF, but I don't want to make assumptions about Win32. 484 int32_t readSize = 0; 485 try 404 486 { 405 // New entry406 foundSizes[pIndex[b].mSize] = 1;487 rFile.Seek(fileOffset, IOStream::SeekType_Absolute); 488 readSize = rFile.Read(pbuffer0, pIndex[b].mSize); 407 489 } 490 catch(BoxException &e) 491 { 492 if(e.GetType() != CommonException::ExceptionType || e.GetSubType() != CommonException::OSFileError) 493 { 494 // Not what we expected, rethrow 495 throw; 496 } 497 } 498 499 // Check for a match 500 bool blockMatched = false; 501 if(readSize == pIndex[b].mSize) 502 { 503 // We don't have a rolling checksum to this point, so all we can do is MD5. If you 504 // worry this is expensive just remember that prior versions of this code 505 // re-read the file BACKUP_FILE_DIFF_MAX_BLOCK_SIZES times and calculated 506 // rolling checksums every time. 507 MD5Digest strong; 508 strong.Add(pbuffer0, pIndex[b].mSize); 509 strong.Finish(); 510 511 // Do we have a match? 512 if(strong.DigestMatches(pIndex[b].mStrongChecksum)) 513 { 514 #ifndef NDEBUG 515 BOX_TRACE("Found unchanged block of size " << pIndex[b].mSize << " at offset " << fileOffset); 516 #endif 517 rFoundBlocks[fileOffset] = b; 518 goodnessOfFit[fileOffset] = pIndex[b].mSize; 519 blockMatched = true; 520 } 521 } 522 523 // We're done with the file offset, so increment now 524 fileOffset += pIndex[b].mSize; 525 526 // If the block didn't match then this is a size we'll have to scan for 527 if(!blockMatched && (pIndex[b].mSize >= BACKUP_FILE_DIFF_MIN_BLOCK_SIZE)) 528 { 529 // Find entry? 
530 std::map<int32_t, int64_t>::const_iterator f(unmatchedSizes.find(pIndex[b].mSize)); 531 if(f != unmatchedSizes.end()) 532 { 533 // Increment existing entry 534 unmatchedSizes[pIndex[b].mSize] = unmatchedSizes[pIndex[b].mSize] + 1; 535 } 536 else 537 { 538 // New entry 539 unmatchedSizes[pIndex[b].mSize] = 1; 540 } 541 } 408 542 } 543 544 // Cleanup 545 ::free(pbuffer0); 546 pbuffer0 = 0; 409 547 } 548 catch(...) 549 { 550 // Cleanup and rethrow 551 if(pbuffer0 != 0) ::free(pbuffer0); 552 throw; 553 } 554 555 // Are we already out of time? 556 if(abortSearch) 557 { 558 #ifndef NDEBUG 559 goto dumpDiffList; 560 #endif 561 return; 562 } 410 563 411 // Make the block sizes 412 for(std::map<int32_t, int64_t>::const_iterator i(foundSizes.begin()); i != foundSizes.end(); ++i) 564 565 // Second search pass... 566 // 567 // In our second phase, having matched all unchanged blocks we now need 568 // to scan for moved blocks. This involves looping across all unmatched 569 // block sizes and using the rolling checksum to look for relocations. To keep 570 // this from being too expensive we cap at BACKUP_FILE_DIFF_MAX_BLOCK_SIZES 571 // for the number of sizes to scan. We also scan for the blocks in order of 572 // their relative probability, a block size that occurs frequently is scanned 573 // first. 574 // 575 576 // Loop all sizes inserting higher usages into the array 577 for(std::map<int32_t, int64_t>::const_iterator i(unmatchedSizes.begin()); i != unmatchedSizes.end(); ++i) 413 578 { 414 // Find the position of the size in the array 579 // TODO: Scanning for any block size isn't cheap, and realistically in many cases 580 // it would be less expensive to upload a few thousand bytes rather than do the scan. 581 // Here would be a good place to filter block sizes that aren't worth the effort 582 // once a suitable set of heuristics is found. 
583 415 584 for(int t = 0; t < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++t) 416 585 { 417 // Instead of sorting on the raw count of blocks, 418 // take the file area covered by this block size. 419 if(i->second * i->first > sizeCounts[t] * Sizes[t]) 586 // Instead of sorting on the raw count of blocks, take the file area covered by this 587 // block size. This helps avoid favoring low numbers of large blocks over many 588 // small blocks. 589 if((i->second * i->first) > (scanSizesCount[t] * scanSizes[t])) 420 590 { 421 591 // Then this size belong before this entry -- shuffle them up 422 592 for(int s = (BACKUP_FILE_DIFF_MAX_BLOCK_SIZES - 1); s >= t; --s) 423 593 { 424 Sizes[s] =Sizes[s-1];425 s izeCounts[s] = sizeCounts[s-1];594 scanSizes[s] = scanSizes[s-1]; 595 scanSizesCount[s] = scanSizesCount[s-1]; 426 596 } 427 597 428 598 // Insert this size 429 Sizes[t] = i->first;430 s izeCounts[t] = i->second;599 scanSizes[t] = i->first; 600 scanSizesCount[t] = i->second; 431 601 432 // Shouldn't do any more searching 602 // Update max size 603 if(maxScanBlockSize < i->first) maxScanBlockSize = i->first; 604 605 // Shouldn't do any more searching for this size 433 606 break; 434 607 } 435 608 } 436 609 } 437 438 // tracethe size table in debug builds610 611 // Dump the size table in debug builds 439 612 #ifndef NDEBUG 440 if(BackupStoreFile::TraceDetailsOfDiffProcess)613 for(int t = 0; t < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++t) 441 614 { 442 for(int t = 0; t < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++t)615 if(scanSizes[t] != 0) 443 616 { 444 BOX_TRACE(" Diffblock size " << t << ": " <<445 Sizes[t] << " (count = " <<446 s izeCounts[t] << ")");617 BOX_TRACE("Scan block size " << t << ": " << 618 scanSizes[t] << " count: " << 619 scanSizesCount[t]); 447 620 } 448 621 } 449 622 #endif 450 }451 623 452 453 454 // -------------------------------------------------------------------------- 455 // 456 // Function 457 // Name: static SearchForMatchingBlocks(IOStream &, std::map<int64_t, 
int64_t> &, BlocksAvailableEntry *, int64_t, int32_t[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES]) 458 // Purpose: Find the matching blocks within the file. 459 // Created: 12/1/04 460 // 461 // -------------------------------------------------------------------------- 462 static void SearchForMatchingBlocks(IOStream &rFile, std::map<int64_t, int64_t> &rFoundBlocks, 463 BlocksAvailableEntry *pIndex, int64_t NumBlocks, 464 int32_t Sizes[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES], DiffTimer *pDiffTimer) 465 { 466 Timer maximumDiffingTime(0, "MaximumDiffingTime"); 467 468 if(pDiffTimer && pDiffTimer->IsManaged()) 624 // If we didn't find any sizes (could happen if all were found in the first matching 625 // phase) we're done. 626 if(maxScanBlockSize == 0) 469 627 { 470 maximumDiffingTime = Timer(pDiffTimer->GetMaximumDiffingTime(), 471 "MaximumDiffingTime"); 628 #ifndef NDEBUG 629 BOX_TRACE("No scan block sizes, skip rolling checksum"); 630 goto dumpDiffList; 631 #endif 632 return; 472 633 } 473 474 std::map<int64_t, int32_t> goodnessOfFit;475 634 476 // Allocate the hash lookup table 477 BlocksAvailableEntry **phashTable = (BlocksAvailableEntry **)::malloc(sizeof(BlocksAvailableEntry *) * (64*1024)); 635 ASSERT(maxScanBlockSize <= BACKUP_FILE_MAX_BLOCK_SIZE); 478 636 479 // Choose a size for the buffer, just a little bit more than the maximum block size 480 int32_t bufSize = Sizes[0]; 481 for(int z = 1; z < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++z) 482 { 483 if(Sizes[z] > bufSize) bufSize = Sizes[z]; 484 } 485 bufSize += 4; 486 ASSERT(bufSize > Sizes[0]); 487 ASSERT(bufSize > 0); 488 if(bufSize > (BACKUP_FILE_MAX_BLOCK_SIZE + 1024)) 489 { 490 THROW_EXCEPTION(BackupStoreException, BadBackupStoreFile) 491 } 637 // Allocate two buffers we'll toggle between at the max scan block size 638 // There sizes are doubled to make final block at the end easier (we're cheating) 639 pbuffer0 = (uint8_t *)::malloc(maxScanBlockSize * 2); 640 pbuffer1 = (uint8_t *)::malloc(maxScanBlockSize * 2); 492 641 493 
// TODO: Because we read in the file a scanned block size at a time, 494 // it is likely to be inefficient. Probably will be much better to 495 // calculate checksums for all block sizes in a single pass. 496 497 // Allocate the buffers. 498 uint8_t *pbuffer0 = (uint8_t *)::malloc(bufSize); 499 uint8_t *pbuffer1 = (uint8_t *)::malloc(bufSize); 642 // Allocate a bitmap buffer to optimize goodnessOfFit access 643 pfitBitmap = (uint8_t *)::malloc(maxScanBlockSize * 2 / 8); 644 500 645 try 501 646 { 502 647 // Check buffer allocation 503 if(pbuffer0 == 0 || pbuffer1 == 0 || p hashTable== 0)648 if(pbuffer0 == 0 || pbuffer1 == 0 || pfitBitmap == 0) 504 649 { 505 650 // If a buffer got allocated, it will be cleaned up in the catch block 506 651 throw std::bad_alloc(); 507 652 } 508 653 509 // Flag to abort the run, if too many blocks are found -- avoid using510 // huge amounts of processor time when files contain many similar blocks.511 bool abortSearch = false;654 // Shift file position back to beginning 655 int64_t bufferFileOffset = 0; 656 rFile.Seek(0, IOStream::SeekType_Absolute); 512 657 513 // Search for each block size in turn 514 // NOTE: Do the smallest size first, so that the scheme for adding 515 // entries in the found list works as expected and replaces smallers block 516 // with larger blocks when it finds matches at the same offset in the file. 
517 for(int s = BACKUP_FILE_DIFF_MAX_BLOCK_SIZES - 1; s >= 0; --s) 658 // We're going to be flipping back and forth between two buffers, the low and high 659 uint8_t *lowBuffer = pbuffer0; 660 int32_t lowBufferBytes = 0; 661 uint8_t *highBuffer = pbuffer1; 662 int32_t highBufferBytes = 0; 663 664 // In some cases we need to carry over reads from prior buffers 665 int32_t carryOverBytes[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES]; 666 ::memset(carryOverBytes, 0, (sizeof(int32_t) * BACKUP_FILE_DIFF_MAX_BLOCK_SIZES)); 667 668 // Read the first buffer's worth of data 669 lowBufferBytes = rFile.Read(lowBuffer, maxScanBlockSize); 670 // Fill the second buffer if appropriate 671 if(lowBufferBytes == maxScanBlockSize) 518 672 { 519 ASSERT(Sizes[s] <= bufSize); 520 BOX_TRACE("Diff pass " << s << ", for block size " << 521 Sizes[s]); 522 523 // Check we haven't finished 524 if(Sizes[s] == 0) 673 highBufferBytes = rFile.Read(highBuffer, maxScanBlockSize); 674 } 675 676 // For every block size, initialize our scan tracking 677 for(int z = 0; z < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++z) 678 { 679 ASSERT(scanSizes[z] <= maxScanBlockSize); 680 681 // The sizes array may be mostly empty, in those cases we have no 682 // state to maintain. 
683 if(scanSizes[z] != 0) 525 684 { 526 // empty entry, try next size 527 continue; 685 // Mark for scan 686 scanThisSize[z] = true; 687 688 // Set up the hash table for this size 689 hashTables[z] = (BlocksAvailableEntry **)::malloc(sizeof(BlocksAvailableEntry *) * (64*1024)); 690 if(hashTables[z] == 0) 691 { 692 throw std::bad_alloc(); 693 } 694 SetupHashTable(pIndex, NumBlocks, scanSizes[z], hashTables[z]); 695 696 // Set up a rolling checksum, but only if there is enough data to start with 697 // (file may now be shorter than some block sizes previously used) 698 if(scanSizes[z] <= lowBufferBytes) 699 { 700 rollingSums[z] = new RollingChecksum(lowBuffer, scanSizes[z]); 701 } 528 702 } 529 530 // Set up the hash table entries 531 SetupHashTable(pIndex, NumBlocks, Sizes[s], phashTable); 703 } 532 704 533 // Shift file position to beginning 534 rFile.Seek(0, IOStream::SeekType_Absolute); 535 536 // Read first block 537 if(rFile.Read(pbuffer0, Sizes[s]) != Sizes[s]) 705 // Read loop while we can get full maxScanBlockSize reads 706 while(highBufferBytes == maxScanBlockSize) 707 { 708 // Oh happy day! We have maxScanBlockSize * 2 bytes available to us across 709 // the two buffers, which means we can walk every block size across all offsets 710 // in lowBuffer without any concern about overrunning the data available in highBuffer. 711 712 // Check diffing timeout 713 if(maximumDiffingTime.HasExpired()) 538 714 { 539 // Size of file too short to match -- do next size 540 continue; 715 ASSERT(pDiffTimer != NULL); 716 BOX_INFO("MaximumDiffingTime reached - suspending file diff"); 717 abortSearch = true; 718 break; 541 719 } 542 720 543 // Setup block pointers 544 uint8_t *beginnings = pbuffer0; 545 uint8_t *endings = pbuffer1; 546 int offset = 0; 721 // Send keep alive 722 if(pDiffTimer) pDiffTimer->DoKeepAlive(); 547 723 548 // Calculate the first checksum, ready for rolling 549 RollingChecksum rolling(beginnings, Sizes[s]); 724 // Don't you wish hash_map was standard? 
std::map is very slow 725 // when we access it as often as we do in the loop below. To work around 726 // we'll create a bitmap of the previous fits in this buffer 727 ::memset(pfitBitmap, 0, maxScanBlockSize * 2 / 8); 728 for(std::map<int64_t, int32_t>::const_iterator i(goodnessOfFit.lower_bound(bufferFileOffset)); i != goodnessOfFit.end(); ++i) 729 { 730 if(i->first >= (bufferFileOffset + maxScanBlockSize)) break; 731 pfitBitmap[(i->first - bufferFileOffset) >> 3] |= (1 << ((i->first - bufferFileOffset) & 0x7)); 732 } 550 733 551 // Then roll, until the file is exhausted 552 int64_t fileBlockNumber = 0; 553 int64_t fileOffset = 0; 554 int rollOverInitialBytes = 0; 555 while(true) 734 // Walk all block sizes in block-probability order 735 for(int s = 0; s < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++s) 556 736 { 557 if(maximumDiffingTime.HasExpired()) 558 { 559 ASSERT(pDiffTimer != NULL); 560 BOX_INFO("MaximumDiffingTime reached - " 561 "suspending file diff"); 562 abortSearch = true; 563 break; 564 } 737 // If there is no rolling checksum at this index, skip its either 738 // a zero size or the file was too small 739 if(!scanThisSize[s] || (rollingSums[s] == 0)) continue; 740 741 #ifndef NDEBUG 742 BOX_TRACE("Diff block size " << scanSizes[s] << " at file offset " << bufferFileOffset); 743 #endif 565 744 566 if(pDiffTimer) 567 { 568 pDiffTimer->DoKeepAlive(); 569 } 570 571 // Load in another block of data, and record how big it is 572 int bytesInEndings = rFile.Read(endings, Sizes[s]); 573 int tmp; 745 // Offset of this block buffer 746 int32_t bufferOffset = 0; 574 747 575 // Skip any bytes from a previous matched block576 if( rollOverInitialBytes > 0 && offset < bytesInEndings)748 // Roll carry over 749 if(carryOverBytes[s] != 0) 577 750 { 578 int spaceLeft = bytesInEndings - offset; 579 int thisRoll = (rollOverInitialBytes > spaceLeft) ? 
spaceLeft : rollOverInitialBytes; 751 ASSERT(carryOverBytes[s] > 0); 752 // Carry can be bigger than maxScanBlockSize because first matching 753 // phase might have used bigger blocks than the current max scan size 754 int32_t thisCarry = carryOverBytes[s]; 755 if(thisCarry > maxScanBlockSize) 756 { 757 thisCarry = maxScanBlockSize; 758 } 580 759 581 rolling.RollForwardSeveral(beginnings+offset, endings+offset, Sizes[s], thisRoll); 582 583 offset += thisRoll; 584 fileOffset += thisRoll; 585 rollOverInitialBytes -= thisRoll; 586 587 if(rollOverInitialBytes) 760 // Perform carry 761 if((thisCarry + scanSizes[s]) > maxScanBlockSize) 588 762 { 589 goto refresh; 763 // Roll is split some low, some low/high 764 int32_t lowRollBytes = maxScanBlockSize - scanSizes[s]; 765 ASSERT(lowRollBytes >= 0); 766 if(lowRollBytes > 0) 767 { 768 rollingSums[s]->RollForwardSeveral(lowBuffer, lowBuffer + scanSizes[s], scanSizes[s], lowRollBytes); 769 } 770 rollingSums[s]->RollForwardSeveral(lowBuffer + lowRollBytes, highBuffer, scanSizes[s], thisCarry - lowRollBytes); 590 771 } 772 else 773 { 774 // Roll is all in low buffer 775 rollingSums[s]->RollForwardSeveral(lowBuffer, lowBuffer + scanSizes[s], scanSizes[s], thisCarry); 776 } 777 // Either way offset is carry 778 bufferOffset = thisCarry; 779 // Reuce carry by the carry amount actually carried 780 carryOverBytes[s] -= thisCarry; 781 ASSERT(carryOverBytes[s] >= 0); 591 782 } 592 593 if(goodnessOfFit.count(fileOffset)) 783 784 // Loop remaining low buffer bytes, taking a checksum at each offset 785 while (bufferOffset < maxScanBlockSize) 594 786 { 595 tmp = goodnessOfFit[fileOffset]; 596 } 597 else 598 { 599 tmp = 0; 600 } 787 // Look for larger size matches at this offset 788 int32_t bestFitSoFarSize = 0; 601 789 602 if(tmp >= Sizes[s]) 603 { 604 // Skip over bigger ready-matched blocks completely 605 rollOverInitialBytes = tmp; 606 int spaceLeft = bytesInEndings - offset; 607 int thisRoll = (rollOverInitialBytes > spaceLeft) ? 
spaceLeft : rollOverInitialBytes; 608 609 rolling.RollForwardSeveral(beginnings+offset, endings+offset, Sizes[s], thisRoll); 610 611 offset += thisRoll; 612 fileOffset += thisRoll; 613 rollOverInitialBytes -= thisRoll; 614 615 if(rollOverInitialBytes) 790 if(pfitBitmap[bufferOffset >> 3] & (1 << (bufferOffset & 0x7))) 616 791 { 617 goto refresh;792 bestFitSoFarSize = goodnessOfFit[bufferFileOffset + bufferOffset]; 618 793 } 619 }620 794 621 while(offset < bytesInEndings) 622 { 623 // Is current checksum in hash list? 624 uint16_t hash = rolling.GetComponentForHashing(); 625 if(phashTable[hash] != 0 && (goodnessOfFit.count(fileOffset) == 0 || goodnessOfFit[fileOffset] < Sizes[s])) 795 if(bestFitSoFarSize >= scanSizes[s]) 626 796 { 627 if(SecondStageMatch(phashTable[hash], rolling, beginnings, endings, offset, Sizes[s], fileBlockNumber, pIndex, rFoundBlocks)) 797 // Roll can be bigger than maxScanBlockSize because first matching 798 // phase might have used bigger blocks than the max scan size 799 int32_t thisRoll = bestFitSoFarSize; 800 int32_t remainderRoll = 0; 801 if(thisRoll > maxScanBlockSize) 628 802 { 629 BOX_TRACE("Found block match for " << hash << " of " << Sizes[s] << " bytes at offset " << fileOffset); 630 goodnessOfFit[fileOffset] = Sizes[s]; 803 thisRoll = maxScanBlockSize; 804 remainderRoll = bestFitSoFarSize - thisRoll; 805 } 631 806 632 // Block matched, roll the checksum forward to the next block without doing 633 // any more comparisons, because these are pointless (as any more matches will be ignored when 634 // the recipe is generated) and just take up valuable processor time. Edge cases are 635 // especially nasty, using huge amounts of time and memory. 636 int skip = Sizes[s]; 637 if(offset < bytesInEndings && skip > 0) 638 { 639 int spaceLeft = bytesInEndings - offset; 640 int thisRoll = (skip > spaceLeft) ? spaceLeft : skip; 807 // Roll forward in the lower buffer. 
This will either exhaust the total roll or will 808 // push the rolling checksum so that it is against the end of the buffer. 809 int32_t lowRoll = ((maxScanBlockSize - bufferOffset - scanSizes[s]) > thisRoll) ? thisRoll : (maxScanBlockSize - bufferOffset - scanSizes[s]); 641 810 642 rolling.RollForwardSeveral(beginnings+offset, endings+offset, Sizes[s], thisRoll); 811 if(lowRoll > 0) 812 { 813 rollingSums[s]->RollForwardSeveral(lowBuffer + bufferOffset, lowBuffer + bufferOffset + scanSizes[s], scanSizes[s], lowRoll); 814 bufferOffset += lowRoll; 815 thisRoll -= lowRoll; 816 } 643 817 644 offset += thisRoll; 645 fileOffset += thisRoll; 646 skip -= thisRoll; 647 } 648 // Not all the bytes necessary will have been skipped, so get them 649 // skipped after the next block is loaded. 650 rollOverInitialBytes = skip; 651 652 // End this loop, so the final byte isn't used again 653 break; 654 } 655 else 818 if(thisRoll < 1) 656 819 { 657 BOX_TRACE("False alarm match for " << hash << " of " << Sizes[s] << " bytes at offset " << fileOffset); 820 ASSERT(remainderRoll == 0); 821 continue; // Back around to bufferOffset loop 658 822 } 659 823 660 int64_t NumBlocksFound = static_cast<int64_t>( 661 rFoundBlocks.size()); 662 int64_t MaxBlocksFound = NumBlocks * 663 BACKUP_FILE_DIFF_MAX_BLOCK_FIND_MULTIPLE; 664 665 if(NumBlocksFound > MaxBlocksFound) 824 // Roll high/low section. Our roll here will either exhaust the total or will 825 // leave the rolling checksum positioned a offset zero in the high buffer 826 int32_t lowHighRoll = ((maxScanBlockSize - bufferOffset) > thisRoll) ? 
thisRoll : (maxScanBlockSize - bufferOffset); 827 ASSERT(lowHighRoll > 0); 828 rollingSums[s]->RollForwardSeveral(lowBuffer + bufferOffset, highBuffer + (scanSizes[s] - (maxScanBlockSize - bufferOffset)), scanSizes[s], lowHighRoll); 829 bufferOffset += lowHighRoll; 830 carryOverBytes[s] = (thisRoll - lowHighRoll) + remainderRoll; 831 ASSERT(carryOverBytes[s] >= 0); 832 continue; // Back around to bufferOffset loop 833 } 834 835 // Look for block match at this size 836 uint16_t hashValue = rollingSums[s]->GetComponentForHashing(); 837 if(hashTables[s][hashValue] != 0) 838 { 839 int32_t lowSize = maxScanBlockSize - bufferOffset; 840 if(lowSize > scanSizes[s]) lowSize = scanSizes[s]; 841 int32_t highSize = scanSizes[s] - lowSize; 842 if(SecondStageMatch(hashTables[s][hashValue], 843 rollingSums[s], 844 lowBuffer + bufferOffset, lowSize, 845 highBuffer, highSize, 846 scanSizes[s], 847 bufferFileOffset + bufferOffset, 848 pIndex, rFoundBlocks)) 666 849 { 667 abortSearch = true; 668 break; 850 #ifndef NDEBUG 851 BOX_TRACE("Found block match for hash " << hashValue << " of size " << scanSizes[s] << " at offset " << bufferFileOffset + bufferOffset); 852 #endif 853 // Update best fit so far 854 ASSERT((goodnessOfFit.count(bufferFileOffset + bufferOffset) == 0) || (goodnessOfFit[bufferFileOffset + bufferOffset] < scanSizes[s])); 855 goodnessOfFit[bufferFileOffset + bufferOffset] = scanSizes[s]; 856 pfitBitmap[bufferOffset >> 3] |= (1 << (bufferOffset & 0x7)); 857 858 // We've found a match, don't scan anymore if we don't expect more blocks of this size 859 --scanSizesCount[s]; 860 if(scanSizesCount[s] < 1) 861 { 862 scanThisSize[s] = false; 863 #ifndef NDEBUG 864 BOX_TRACE("Short-circuit size " << scanSizes[s]); 865 #endif 866 } 867 868 // Roll so we don't bother diffing the remainder of this block, see bestFitSoFarSize 869 // roll above for an explanation of the logic 870 int32_t thisRoll = scanSizes[s]; 871 int32_t remainderRoll = 0; 872 if(thisRoll > maxScanBlockSize) 873 
{ 874 thisRoll = maxScanBlockSize; 875 remainderRoll = bestFitSoFarSize - thisRoll; 876 } 877 int32_t lowRoll = ((maxScanBlockSize - bufferOffset - scanSizes[s]) > thisRoll) ? thisRoll : (maxScanBlockSize - bufferOffset - scanSizes[s]); 878 if(lowRoll > 0) 879 { 880 rollingSums[s]->RollForwardSeveral(lowBuffer + bufferOffset, lowBuffer + bufferOffset + scanSizes[s], scanSizes[s], lowRoll); 881 bufferOffset += lowRoll; 882 thisRoll -= lowRoll; 883 } 884 if(thisRoll < 1) 885 { 886 ASSERT(remainderRoll == 0); 887 continue; // Back around to bufferOffset loop 888 } 889 int32_t lowHighRoll = ((maxScanBlockSize - bufferOffset) > thisRoll) ? thisRoll : (maxScanBlockSize - bufferOffset); 890 ASSERT(lowHighRoll > 0); 891 rollingSums[s]->RollForwardSeveral(lowBuffer + bufferOffset, highBuffer + (scanSizes[s] - (maxScanBlockSize - bufferOffset)), scanSizes[s], lowHighRoll); 892 bufferOffset += lowHighRoll; 893 carryOverBytes[s] = (thisRoll - lowHighRoll) + remainderRoll; 894 ASSERT(carryOverBytes[s] >= 0); 895 continue; // Back around to bufferOffset loop 669 896 } 670 897 } 671 898 672 // Roll checksum forward 673 rolling.RollForward(beginnings[offset], endings[offset], Sizes[s]); 674 675 // Increment offsets 676 ++offset; 677 ++fileOffset; 899 // Roll one byte, either low or low/high depending on current offset 900 if((bufferOffset + scanSizes[s]) >= maxScanBlockSize) 901 { 902 rollingSums[s]->RollForward(lowBuffer[bufferOffset], highBuffer[scanSizes[s] - (maxScanBlockSize - bufferOffset)], scanSizes[s]); 903 } 904 else 905 { 906 rollingSums[s]->RollForward(lowBuffer[bufferOffset], lowBuffer[bufferOffset + scanSizes[s]], scanSizes[s]); 907 } 908 ++bufferOffset; 678 909 } 910 911 // Check if we've found too much 912 int64_t NumBlocksFound = static_cast<int64_t>(rFoundBlocks.size()); 913 int64_t MaxBlocksFound = NumBlocks * BACKUP_FILE_DIFF_MAX_BLOCK_FIND_MULTIPLE; 914 if(NumBlocksFound > MaxBlocksFound) 915 { 916 abortSearch = true; 917 break; 918 } 919 } 920 // If we're 
aborting, pass through 921 if(abortSearch) break; 922 923 // Swap buffers so high is low and the go back around 924 uint8_t *swap = lowBuffer; 925 lowBuffer = highBuffer; 926 ASSERT((lowBufferBytes == highBufferBytes) && (lowBufferBytes == maxScanBlockSize)); 927 bufferFileOffset += maxScanBlockSize; 928 highBuffer = swap; 929 highBufferBytes = rFile.Read(highBuffer, maxScanBlockSize); 930 931 } // End of maxScanBlockSize reads 932 933 // What remains is either: 934 // - An abort or timeout 935 // - A low buffer with less than maxScanBlockSize bytes and an empty high buffer 936 // (Potentially zero low bytes for a zero-length file) 937 // - A maxScanBlockSize low buffer and a maxScanBlockSize or less high buffer 938 // 939 if(!abortSearch && !maximumDiffingTime.HasExpired() && (lowBufferBytes > 0)) 940 { 941 // We're going to repeat the logic in the previous read loop, but have 942 // to be careful not to walk off the end of the available data. To make this 943 // a lot less convoluted, we're just going to copy to consolidate buffers. 944 // This is cheating a bit, but worth the simplicity. 945 ASSERT((lowBufferBytes < maxScanBlockSize) ? (highBufferBytes == 0) : true); 946 ASSERT((highBufferBytes != 0) ? 
(lowBufferBytes == maxScanBlockSize) : true); 947 uint8_t *endingBuffer = lowBuffer; 948 if(highBufferBytes != 0) 949 { 950 ::memcpy(endingBuffer + maxScanBlockSize, highBuffer, highBufferBytes); 951 } 952 int32_t endingBytes = lowBufferBytes + highBufferBytes; 953 954 // Send keep alive 955 if(pDiffTimer) pDiffTimer->DoKeepAlive(); 956 957 // std::map still not winning 958 ::memset(pfitBitmap, 0, maxScanBlockSize * 2 / 8); 959 for(std::map<int64_t, int32_t>::const_iterator i(goodnessOfFit.lower_bound(bufferFileOffset)); i != goodnessOfFit.end(); ++i) 960 { 961 pfitBitmap[(i->first - bufferFileOffset) >> 3] |= (1 << ((i->first - bufferFileOffset) & 0x7)); 962 } 963 964 // Walk all block sizes in block-probability order 965 for(int s = 0; s < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++s) 966 { 967 // If there is no rolling checksum at this index, skip its either 968 // a zero size or the file was too small 969 if(!scanThisSize[s] || (rollingSums[s] == 0)) continue; 679 970 680 if(abortSearch) break; 971 // If there's not enough data at the end, skip this block size 972 if(endingBytes < scanSizes[s]) continue; 973 974 #ifndef NDEBUG 975 BOX_TRACE("Diff block size " << scanSizes[s] << " at file ending offset " << bufferFileOffset); 976 #endif 681 977 682 refresh: 683 // Finished? 
684 if(bytesInEndings != Sizes[s]) 978 // Offset of this block buffer 979 int32_t bufferOffset = 0; 980 981 // For each block size we can only walk so far safely 982 int32_t maxSafeOffset = endingBytes - scanSizes[s]; 983 984 // Roll carry over 985 if(carryOverBytes[s] != 0) 685 986 { 686 // No more data in file -- check the final block 687 // (Do a copy and paste of 5 lines of code instead of introducing a comparison for 688 // each byte of the file) 689 uint16_t hash = rolling.GetComponentForHashing(); 690 if(phashTable[hash] != 0 && (goodnessOfFit.count(fileOffset) == 0 || goodnessOfFit[fileOffset] < Sizes[s])) 987 ASSERT(carryOverBytes[s] > 0); 988 // Don't walk off the end 989 if(carryOverBytes[s] > maxSafeOffset) continue; 990 // Roll 991 rollingSums[s]->RollForwardSeveral(endingBuffer, endingBuffer + scanSizes[s], scanSizes[s], carryOverBytes[s]); 992 bufferOffset = carryOverBytes[s]; 993 carryOverBytes[s] = 0; // Not really needed again, but be clean 994 } 995 996 // Loop remaining end bytes, we must walk to exactly and then break the loop 997 while (bufferOffset <= maxSafeOffset) 998 { 999 // Look for larger size matches at this offset 1000 int32_t bestFitSoFarSize = 0; 1001 if(pfitBitmap[bufferOffset >> 3] & (1 << (bufferOffset & 0x7))) 691 1002 { 692 if(SecondStageMatch(phashTable[hash], rolling, beginnings, endings, offset, Sizes[s], fileBlockNumber, pIndex, rFoundBlocks)) 1003 bestFitSoFarSize = goodnessOfFit[bufferFileOffset + bufferOffset]; 1004 } 1005 if(bestFitSoFarSize >= scanSizes[s]) { 1006 // Space for this roll? 
1007 if((bufferOffset + bestFitSoFarSize) > maxSafeOffset) break; // Kill byte walk loop 1008 // Roll 1009 rollingSums[s]->RollForwardSeveral(endingBuffer + bufferOffset, endingBuffer + bufferOffset + scanSizes[s], scanSizes[s], bestFitSoFarSize); 1010 bufferOffset += bestFitSoFarSize; 1011 continue; // Back around to byte read loop 1012 } 1013 1014 // Look for block match at this size 1015 uint16_t hashValue = rollingSums[s]->GetComponentForHashing(); 1016 if(hashTables[s][hashValue] != 0) 1017 { 1018 if(SecondStageMatch(hashTables[s][hashValue], 1019 rollingSums[s], 1020 endingBuffer + bufferOffset, 1021 scanSizes[s], 1022 0, 0, scanSizes[s], 1023 bufferFileOffset + bufferOffset, 1024 pIndex, rFoundBlocks)) 693 1025 { 694 goodnessOfFit[fileOffset] = Sizes[s]; 1026 #ifndef NDEBUG 1027 BOX_TRACE("Found block match for hash " << hashValue << " of size " << scanSizes[s] << " at offset " << bufferFileOffset + bufferOffset); 1028 #endif 1029 // Update best fit so far 1030 ASSERT((goodnessOfFit.count(bufferFileOffset + bufferOffset) == 0) || (goodnessOfFit[bufferFileOffset + bufferOffset] < scanSizes[s])); 1031 goodnessOfFit[bufferFileOffset + bufferOffset] = scanSizes[s]; 1032 pfitBitmap[bufferOffset >> 3] |= (1 << (bufferOffset & 0x7)); 1033 1034 // We've found a match, don't scan anymore if we don't expect more blocks of this size 1035 --scanSizesCount[s]; 1036 if(scanSizesCount[s] < 1) 1037 { 1038 scanThisSize[s] = false; 1039 #ifndef NDEBUG 1040 BOX_TRACE("Short-circuit size " << scanSizes[s]); 1041 #endif 1042 } 1043 1044 // Roll over this matched block 1045 if((bufferOffset + scanSizes[s]) > maxSafeOffset) break; // Kill byte walk loop 1046 rollingSums[s]->RollForwardSeveral(endingBuffer + bufferOffset, endingBuffer + bufferOffset + scanSizes[s], scanSizes[s], scanSizes[s]); 1047 bufferOffset += scanSizes[s]; 1048 continue; // Back around to byte read loop 695 1049 } 696 1050 } 697 698 // finish 699 break; 1051 1052 // Abort check 1053 int64_t NumBlocksFound = 
static_cast<int64_t>(rFoundBlocks.size()); 1054 int64_t MaxBlocksFound = NumBlocks * BACKUP_FILE_DIFF_MAX_BLOCK_FIND_MULTIPLE; 1055 if(NumBlocksFound > MaxBlocksFound) 1056 { 1057 abortSearch = true; 1058 break; 1059 } 1060 1061 // Because byte loop conditional can equal maxSafeOffset we must not read off 1062 // end here 1063 if(bufferOffset == maxSafeOffset) break; // Done with reading bytes 1064 rollingSums[s]->RollForward(endingBuffer[bufferOffset], endingBuffer[bufferOffset + scanSizes[s]], scanSizes[s]); 1065 ++bufferOffset; 700 1066 } 701 702 // Switch buffers, reset offset 703 beginnings = endings; 704 endings = (beginnings == pbuffer0)?(pbuffer1):(pbuffer0); // ie the other buffer 705 offset = 0; 706 707 // And count the blocks which have been done 708 ++fileBlockNumber; 1067 if(abortSearch) break; 1068 // NOTE: Nothing else can go here without restructuring use of "break" in the byte read loop above 709 1069 } 710 711 if(abortSearch) break;712 1070 } 713 1071 714 // Free buffers and hash table 1072 // Clean up 1073 ::free(pbuffer0); 1074 pbuffer0 = 0; 715 1075 ::free(pbuffer1); 716 1076 pbuffer1 = 0; 717 ::free(pbuffer0); 718 pbuffer0 = 0; 719 ::free(phashTable); 720 phashTable = 0; 1077 ::free(pfitBitmap); 1078 pfitBitmap = 0; 1079 for(int z = 0; z < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++z) 1080 { 1081 if(hashTables[z] != 0) 1082 { 1083 free(hashTables[z]); 1084 hashTables[z] = 0; 1085 } 1086 if(rollingSums[z] != 0) 1087 { 1088 delete rollingSums[z]; 1089 rollingSums[z] = 0; 1090 } 1091 } 721 1092 } 722 1093 catch(...) 
723 1094 { 724 // Cleanup and throw 1095 // Cleanup and rethrow 1096 if(pbuffer0 != 0) ::free(pbuffer0); 725 1097 if(pbuffer1 != 0) ::free(pbuffer1); 726 if(pbuffer0 != 0) ::free(pbuffer0); 727 if(phashTable != 0) ::free(phashTable); 1098 if(pfitBitmap != 0) ::free(pfitBitmap); 1099 for(int z = 0; z < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++z) 1100 { 1101 if(hashTables[z] != 0) free(hashTables[z]); 1102 } 1103 for(int z = 0; z < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++z) 1104 { 1105 if(rollingSums[z] != 0) delete rollingSums[z]; 1106 } 728 1107 throw; 729 1108 } 730 1109 731 1110 #ifndef NDEBUG 732 if(BackupStoreFile::TraceDetailsOfDiffProcess) 1111 dumpDiffList: 1112 // Trace out the found blocks in debug mode 1113 BOX_TRACE("Diff: list of found blocks"); 1114 BOX_TRACE("======== ======== ======== ========"); 1115 BOX_TRACE(" Offset BlkIdx Size Movement"); 1116 for(std::map<int64_t, int64_t>::const_iterator i(rFoundBlocks.begin()); i != rFoundBlocks.end(); ++i) 733 1117 { 734 // Trace out the found blocks in debug mode 735 BOX_TRACE("Diff: list of found blocks"); 736 BOX_TRACE("======== ======== ======== ========"); 737 BOX_TRACE(" Offset BlkIdx Size Movement"); 738 for(std::map<int64_t, int64_t>::const_iterator i(rFoundBlocks.begin()); i != rFoundBlocks.end(); ++i) 1118 int64_t orgLoc = 0; 1119 for(int64_t b = 0; b < i->second; ++b) 739 1120 { 740 int64_t orgLoc = 0; 741 for(int64_t b = 0; b < i->second; ++b) 742 { 743 orgLoc += pIndex[b].mSize; 744 } 745 BOX_TRACE(std::setw(8) << i->first << " " << 746 std::setw(8) << i->second << " " << 747 std::setw(8) << pIndex[i->second].mSize << 748 " " << 749 std::setw(8) << (i->first - orgLoc)); 1121 orgLoc += pIndex[b].mSize; 750 1122 } 751 BOX_TRACE("======== ======== ======== ========"); 1123 BOX_TRACE(std::setw(8) << i->first << " " << 1124 std::setw(8) << i->second << " " << 1125 std::setw(8) << pIndex[i->second].mSize << 1126 " " << 1127 std::setw(8) << (i->first - orgLoc)); 752 1128 } 1129 BOX_TRACE("======== ======== 
======== ========"); 753 1130 #endif 754 1131 } 755 1132 … … 781 1158 { 782 1159 //BOX_TRACE("Another hash entry for " << hash << " found"); 783 1160 // Yes -- need to set the pointer in this entry to the current entry to build the linked list 1161 // This is safe because there is only one hash table per block size and block sizes 1162 // are unique. 784 1163 pIndex[b].mpNextInHashList = pHashTable[hash]; 785 1164 } 786 1165 … … 790 1169 } 791 1170 } 792 1171 793 794 1172 // -------------------------------------------------------------------------- 795 1173 // 796 1174 // Function 797 1175 // Name: static bool SecondStageMatch(xxx) 798 // Purpose: When a match in the hash table is found, scan for second stage match using strong checksum. 1176 // Purpose: When a match in the hash table is found, scan for 1177 // second stage match using strong checksum. 799 1178 // Created: 14/1/04 800 1179 // 801 1180 // -------------------------------------------------------------------------- 802 static bool SecondStageMatch(BlocksAvailableEntry *pFirstInHashList, RollingChecksum &fastSum, uint8_t *pBeginnings, uint8_t *pEndings, 803 int Offset, int32_t BlockSize, int64_t FileBlockNumber, BlocksAvailableEntry *pIndex, std::map<int64_t, int64_t> &rFoundBlocks) 1181 static bool SecondStageMatch(BlocksAvailableEntry *pFirstInHashList, 1182 RollingChecksum *pFastSum, uint8_t *pLow, int32_t LowSize, 1183 uint8_t *pHigh, int32_t HighSize, int32_t BlockSize, 1184 int64_t FileOffset, BlocksAvailableEntry *pIndex, 1185 std::map<int64_t, int64_t> &rFoundBlocks) 804 1186 { 805 1187 // Check parameters 806 ASSERT(pBeginnings != 0); 807 ASSERT(pEndings != 0); 808 ASSERT(Offset >= 0); 1188 ASSERT(pFirstInHashList != 0); 1189 ASSERT(pFastSum != 0); 1190 ASSERT(pLow != 0); 1191 ASSERT(LowSize != 0); 1192 ASSERT((HighSize > 0) ? 
(pHigh != 0) : true); 809 1193 ASSERT(BlockSize > 0); 810 ASSERT( pFirstInHashList != 0);1194 ASSERT(BlockSize == (LowSize + HighSize)); 811 1195 ASSERT(pIndex != 0); 812 1196 813 1197 #ifndef NDEBUG 814 uint16_t DEBUG_Hash = fastSum.GetComponentForHashing();1198 uint16_t DEBUG_Hash = pFastSum->GetComponentForHashing(); 815 1199 #endif 816 uint32_t Checksum = fastSum.GetChecksum();1200 uint32_t Checksum = pFastSum->GetChecksum(); 817 1201 818 1202 // Before we go to the expense of the MD5, make sure it's a darn good match on the checksum we already know. 819 1203 BlocksAvailableEntry *scan = pFirstInHashList; … … 834 1218 835 1219 // Calculate the strong MD5 digest for this block 836 1220 MD5Digest strong; 837 // Add the data from the beginnings 838 strong.Add(pBeginnings + Offset, BlockSize - Offset); 839 // Add any data from the endings 840 if(Offset > 0) 841 { 842 strong.Add(pEndings, Offset); 843 } 1221 strong.Add(pLow, LowSize); 1222 if(HighSize > 0) strong.Add(pHigh, HighSize); 844 1223 strong.Finish(); 845 1224 846 1225 // Then go through the entries in the hash list, comparing with the strong digest calculated 847 1226 scan = pFirstInHashList; 848 //BOX_TRACE("second stage match"); 1227 849 1228 while(scan != 0) 850 1229 { 851 1230 //BOX_TRACE("scan size " << scan->mSize << … … 853 1232 // ", hash " << Hash); 854 1233 ASSERT(scan->mSize == BlockSize); 855 1234 ASSERT(RollingChecksum::ExtractHashingComponent(scan->mWeakChecksum) == DEBUG_Hash); 856 1235 857 1236 // Compare? 858 1237 if(strong.DigestMatches(scan->mStrongChecksum)) 859 1238 { 860 //BOX_TRACE("Match!\n");861 1239 // Found! Add to list of found blocks... 862 int64_t fileOffset = (FileBlockNumber * BlockSize) + Offset;863 1240 int64_t blockIndex = (scan - pIndex); // pointer arthmitic is frowned upon. But most efficient way of doing it here -- alternative is to use more memory 864 1241 865 1242 // We do NOT search for smallest blocks first, as this code originally assumed. 
866 1243 // To prevent this from potentially overwriting a better match, the caller must determine 867 1244 // the relative "goodness" of any existing match and this one, and avoid the call if it 868 1245 // could be detrimental. 869 rFoundBlocks[ fileOffset] = blockIndex;1246 rFoundBlocks[FileOffset] = blockIndex; 870 1247 871 1248 // No point in searching further, report success 872 1249 return true; 873 1250 } 874 875 1251 // Next 876 1252 scan = scan->mpNextInHashList; 877 1253 } … … 917 1293 rRecipe.push_back(instruction); 918 1294 919 1295 #ifndef NDEBUG 920 if(BackupStoreFile::TraceDetailsOfDiffProcess) 921 { 922 BOX_TRACE("Diff: Default recipe generated, " << 923 SizeOfInputFile << " bytes of file"); 924 } 1296 BOX_TRACE("Diff: Default recipe generated, " << 1297 SizeOfInputFile << " bytes of file"); 925 1298 #endif 926 1299 927 1300 // Don't do anything … … 1016 1389 BOX_TRACE("Diff: " << 1017 1390 debug_NewBytesFound << " new bytes found, " << 1018 1391 debug_OldBlocksUsed << " old blocks used"); 1019 if(BackupStoreFile::TraceDetailsOfDiffProcess) 1392 1393 if (true) 1020 1394 { 1021 1395 BOX_TRACE("Diff: Recipe generated (size " << rRecipe.size()); 1022 1396 BOX_TRACE("======== ========= ========"); -
test/bbackupd/testbbackupd.cpp
1002 1002 // before any matching blocks could be found. 1003 1003 intercept_setup_delay("testfiles/TestDir1/spacetest/f1", 1004 1004 0, 4000, SYS_read, 1); 1005 pid = start_internal_daemon(); 1006 intercept_clear_setup(); 1005 { 1006 Timers::Init(); 1007 BackupDaemon bbackupd; 1008 bbackupd.Configure("testfiles/bbackupd.conf"); 1009 bbackupd.InitCrypto(); 1007 1010 1008 fd = open("testfiles/TestDir1/spacetest/f1", O_WRONLY); 1009 TEST_THAT(fd > 0); 1010 // write again, to update the file's timestamp 1011 TEST_EQUAL(sizeof(buffer), write(fd, buffer, sizeof(buffer)), 1012 "Buffer write"); 1013 TEST_THAT(close(fd) == 0); 1011 fd = open("testfiles/TestDir1/spacetest/f1", O_WRONLY); 1012 TEST_THAT(fd > 0); 1013 // write again, to update the file's timestamp 1014 TEST_EQUAL(1, write(fd, "z", 1), "Buffer write"); 1015 TEST_THAT(close(fd) == 0); 1014 1016 1015 wait_for_backup_operation(); 1016 // can't test whether intercept was triggered, because 1017 // it's in a different process. 1018 // TEST_THAT(intercept_triggered()); 1019 TEST_THAT(stop_internal_daemon(pid)); 1017 // wait long enough to put file into sync window 1018 wait_for_operation(5); 1020 1019 1020 bbackupd.RunSyncNow(); 1021 TEST_THAT(intercept_triggered()); 1022 intercept_clear_setup(); 1023 Timers::Cleanup(); 1024 } 1025 1021 1026 // check that the diff was aborted, i.e. upload was not a diff 1022 1027 found1 = false; 1023 1028
