| 46 | | static void FindMostUsedSizes(BlocksAvailableEntry *pIndex, int64_t NumBlocks, int32_t Sizes[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES]); |
| 47 | | static void SearchForMatchingBlocks(IOStream &rFile, |
| 48 | | std::map<int64_t, int64_t> &rFoundBlocks, BlocksAvailableEntry *pIndex, |
| 49 | | int64_t NumBlocks, int32_t Sizes[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES], |
| 50 | | DiffTimer *pDiffTimer); |
| | 40 | static void SearchForMatchingBlocks(IOStream &rFile, std::map<int64_t, int64_t> &rFoundBlocks, BlocksAvailableEntry *pIndex, int64_t NumBlocks, DiffTimer *pDiffTimer); |
| 52 | | static bool SecondStageMatch(BlocksAvailableEntry *pFirstInHashList, RollingChecksum &fastSum, uint8_t *pBeginnings, uint8_t *pEndings, int Offset, int32_t BlockSize, int64_t FileBlockNumber, |
| 53 | | BlocksAvailableEntry *pIndex, std::map<int64_t, int64_t> &rFoundBlocks); |
| | 42 | static bool SecondStageMatch(BlocksAvailableEntry *pFirstInHashList, RollingChecksum *pFastSum, |
| | 43 | uint8_t *pLow, int32_t LowSize, uint8_t *pHigh, int32_t HighSize, int32_t BlockSize, int64_t FileOffset, |
| | 44 | BlocksAvailableEntry *pIndex, std::map<int64_t, int64_t> &rFoundBlocks); |
| 385 | | // Array for collecting sizes |
| 386 | | std::map<int32_t, int64_t> foundSizes; |
| | 368 | // Flag to abort the run, if too many blocks are found or the diffing |
| | 369 | // timeout expires |
| | 370 | bool abortSearch = false; |
| | 371 | |
| | 372 | // Buffers used during both phases of search |
| | 373 | uint8_t *pbuffer0 = 0; |
| | 374 | uint8_t *pbuffer1 = 0; |
| | 375 | |
| | 376 | // Track offsets that already have block matches. We don't really care |
| | 377 | // if it's sorted, and this actually produces a performance issue, so |
| | 378 | // see pfitBitmap for a workaround |
| | 379 | std::map<int64_t, int32_t> goodnessOfFit; |
| 388 | | // Run through blocks and make a count of the entries |
| 389 | | for(int64_t b = 0; b < NumBlocks; ++b) |
| | 381 | // Collect sizes that aren't found in the file at their old offset |
| | 382 | std::map<int32_t, int64_t> unmatchedSizes; |
| | 383 | |
| | 384 | // Our arrays of block sizes during second search pass |
| | 385 | int32_t scanSizes[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES]; |
| | 386 | int64_t scanSizesCount[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES]; |
| | 387 | int32_t maxScanBlockSize = 0; |
| | 388 | ::memset(scanSizes, 0, (sizeof(int32_t) * BACKUP_FILE_DIFF_MAX_BLOCK_SIZES)); |
| | 389 | ::memset(scanSizesCount, 0, (sizeof(int64_t) * BACKUP_FILE_DIFF_MAX_BLOCK_SIZES)); |
| | 390 | |
| | 391 | // We need to keep separate rolling checksums for each block size in second search |
| | 392 | RollingChecksum *rollingSums[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES]; |
| | 393 | ::memset(rollingSums, 0, (sizeof(RollingChecksum *) * BACKUP_FILE_DIFF_MAX_BLOCK_SIZES)); |
| | 394 | |
| | 395 | // And a hash lookup table per block size in second search |
| | 396 | BlocksAvailableEntry **hashTables[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES]; |
| | 397 | ::memset(hashTables, 0, (sizeof(BlocksAvailableEntry **) * BACKUP_FILE_DIFF_MAX_BLOCK_SIZES)); |
| | 398 | |
| | 399 | // We allow second search pass to short-circuit off rare blocks |
| | 400 | bool scanThisSize[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES]; |
| | 401 | ::memset(scanThisSize, 0, (sizeof(bool) * BACKUP_FILE_DIFF_MAX_BLOCK_SIZES)); |
| | 402 | |
| | 403 | // During block read we have a bitmap of prefit locations to avoid std::map |
| | 404 | // performance problems |
| | 405 | uint8_t *pfitBitmap = 0; |
| | 406 | |
| | 407 | |
| | 408 | // First search pass... |
| | 409 | // |
| | 410 | // For many files (especially large ones) most of the file is unchanged. |
| | 411 | // The RollingChecksum process requires us to read every byte of the file looking for |
| | 412 | // blocks that have moved. However, we can make that process more efficient by |
| | 413 | // quickly rolling over areas that match a different block. We can also use |
| | 414 | // this to eliminate entirely the rolling checksum for block sizes that only exist |
| | 415 | // at one location in the file. |
| | 416 | // |
| | 417 | // Thus we start by looking for blocks that have not moved. Only if a block |
| | 418 | // cannot be found at its previous location do we consider scanning for it by |
| | 419 | // rolling checksum size. |
| | 420 | // |
| | 421 | // This strategy has some disadvantages for files with lots of repeating content |
| | 422 | // that happens to align with our block size, but the reduction in diff time |
| | 423 | // for more typical files is worth it. |
| | 424 | // |
| | 425 | // Note also that in this pass we consider _all_ block sizes (no larger than |
| | 426 | // BACKUP_FILE_MAX_BLOCK_SIZE). Any block size, no matter how small or rare |
| | 427 | // is cheap for us to find in this pass. |
| | 428 | // |
| | 429 | pbuffer0 = (uint8_t *)::malloc(BACKUP_FILE_MAX_BLOCK_SIZE); |
| | 430 | try |
| 394 | | // Find entry? |
| 395 | | std::map<int32_t, int64_t>::const_iterator f(foundSizes.find(pIndex[b].mSize)); |
| 396 | | if(f != foundSizes.end()) |
| | 443 | // Check diffing timeout |
| | 444 | if(maximumDiffingTime.HasExpired()) |
| | 445 | { |
| | 446 | ASSERT(pDiffTimer != NULL); |
| | 447 | BOX_INFO("MaximumDiffingTime reached - suspending file diff"); |
| | 448 | abortSearch = true; |
| | 449 | break; |
| | 450 | } |
| | 451 | |
| | 452 | // Send keep alive |
| | 453 | if(pDiffTimer) pDiffTimer->DoKeepAlive(); |
| | 454 | |
| | 455 | // Skip blocks too large for our buffer |
| | 456 | if(pIndex[b].mSize > BACKUP_FILE_MAX_BLOCK_SIZE) { |
| | 457 | fileOffset += pIndex[b].mSize; |
| | 458 | continue; |
| | 459 | } |
| | 460 | |
| | 461 | // Have to guard the seek operation, it could throw. We don't know size of the |
| | 462 | // current file, and checking is pointless anyway since there's a race. |
| | 463 | // In reality on Unix this is implemented with lseek which will mean we can |
| | 464 | // seek past the EOF, but I don't want to make assumptions about Win32. |
| | 465 | int32_t readSize = 0; |
| | 466 | try |
| | 467 | { |
| | 468 | rFile.Seek(fileOffset, IOStream::SeekType_Absolute); |
| | 469 | readSize = rFile.Read(pbuffer0, pIndex[b].mSize); |
| | 470 | } |
| | 471 | catch(BoxException &e) |
| | 472 | { |
| | 473 | if(e.GetType() != CommonException::ExceptionType || e.GetSubType() != CommonException::OSFileError) |
| | 474 | { |
| | 475 | // Not what we expected, rethrow |
| | 476 | throw; |
| | 477 | } |
| | 478 | } |
| | 479 | |
| | 480 | // Check for a match |
| | 481 | bool blockMatched = false; |
| | 482 | if(readSize == pIndex[b].mSize) |
| 398 | | // Increment existing entry |
| 399 | | foundSizes[pIndex[b].mSize] = foundSizes[pIndex[b].mSize] + 1; |
| | 484 | // We don't have a rolling checksum to this point, so all we can do is MD5. If you |
| | 485 | // worry this is expensive just remember that prior versions of this code |
| | 486 | // re-read the file BACKUP_FILE_DIFF_MAX_BLOCK_SIZES times and calculated |
| | 487 | // rolling checksums every time. |
| | 488 | MD5Digest strong; |
| | 489 | strong.Add(pbuffer0, pIndex[b].mSize); |
| | 490 | strong.Finish(); |
| | 491 | |
| | 492 | // Do we have a match? |
| | 493 | if(strong.DigestMatches(pIndex[b].mStrongChecksum)) |
| | 494 | { |
| | 495 | #ifndef NDEBUG |
| | 496 | BOX_TRACE("Found unchanged block of size " << pIndex[b].mSize << " at offset " << fileOffset); |
| | 497 | #endif |
| | 498 | rFoundBlocks[fileOffset] = b; |
| | 499 | goodnessOfFit[fileOffset] = pIndex[b].mSize; |
| | 500 | blockMatched = true; |
| | 501 | } |
| 412 | | // Find the position of the size in the array |
| | 539 | #ifndef NDEBUG |
| | 540 | goto dumpDiffList; |
| | 541 | #endif |
| | 542 | return; |
| | 543 | } |
| | 544 | |
| | 545 | |
| | 546 | // Second search pass... |
| | 547 | // |
| | 548 | // In our second phase, having matched all unchanged blocks we now need |
| | 549 | // to scan for moved blocks. This involves looping across all unmatched |
| | 550 | // block sizes and using the rolling checksum to look for relocations. To keep |
| | 551 | // this from being too expensive we cap at BACKUP_FILE_DIFF_MAX_BLOCK_SIZES |
| | 552 | // for the number of sizes to scan. We also scan for the blocks in order of |
| | 553 | // their relative probability, a block size that occurs frequently is scanned |
| | 554 | // first. |
| | 555 | // |
| | 556 | |
| | 557 | // Loop all sizes inserting higher usages into the array |
| | 558 | for(std::map<int32_t, int64_t>::const_iterator i(unmatchedSizes.begin()); i != unmatchedSizes.end(); ++i) |
| | 559 | { |
| | 560 | // TODO: Scanning for any block size isn't cheap, and realistically in many cases |
| | 561 | // it would be less expensive to upload a few thousand bytes rather than do the scan. |
| | 562 | // Here would be a good place to filter block sizes that aren't worth the effort |
| | 563 | // once a suitable set of heuristics is found. |
| | 564 | |
| 446 | | } |
| 447 | | |
| 448 | | |
| 449 | | |
| 450 | | // -------------------------------------------------------------------------- |
| 451 | | // |
| 452 | | // Function |
| 453 | | // Name: static SearchForMatchingBlocks(IOStream &, std::map<int64_t, int64_t> &, BlocksAvailableEntry *, int64_t, int32_t[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES], DiffTimer *) |
| 454 | | // Purpose: Find the matching blocks within the file. |
| 455 | | // Created: 12/1/04 |
| 456 | | // |
| 457 | | // -------------------------------------------------------------------------- |
| 458 | | static void SearchForMatchingBlocks(IOStream &rFile, std::map<int64_t, int64_t> &rFoundBlocks, |
| 459 | | BlocksAvailableEntry *pIndex, int64_t NumBlocks, |
| 460 | | int32_t Sizes[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES], DiffTimer *pDiffTimer) |
| 461 | | { |
| 462 | | Timer maximumDiffingTime(0); |
| 469 | | std::map<int64_t, int32_t> goodnessOfFit; |
| 470 | | |
| 471 | | // Allocate the hash lookup table |
| 472 | | BlocksAvailableEntry **phashTable = (BlocksAvailableEntry **)::malloc(sizeof(BlocksAvailableEntry *) * (64*1024)); |
| 473 | | |
| 474 | | // Choose a size for the buffer, just a little bit more than the maximum block size |
| 475 | | int32_t bufSize = Sizes[0]; |
| 476 | | for(int z = 1; z < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++z) |
| 477 | | { |
| 478 | | if(Sizes[z] > bufSize) bufSize = Sizes[z]; |
| 479 | | } |
| 480 | | bufSize += 4; |
| 481 | | ASSERT(bufSize > Sizes[0]); |
| 482 | | ASSERT(bufSize > 0); |
| 483 | | if(bufSize > (BACKUP_FILE_MAX_BLOCK_SIZE + 1024)) |
| 484 | | { |
| 485 | | THROW_EXCEPTION(BackupStoreException, BadBackupStoreFile) |
| 486 | | } |
| | 611 | // Allocate two buffers we'll toggle between at the max scan block size |
| | 612 | // Their sizes are doubled to make the final block at the end easier (we're cheating) |
| | 613 | pbuffer0 = (uint8_t *)::malloc(maxScanBlockSize * 2); |
| | 614 | pbuffer1 = (uint8_t *)::malloc(maxScanBlockSize * 2); |
| | 615 | |
| | 616 | // Allocate a bitmap buffer to optimize goodnessOfFit access |
| | 617 | pfitBitmap = (uint8_t *)::malloc(maxScanBlockSize * 2 / 8); |
| 504 | | // Flag to abort the run, if too many blocks are found -- avoid using |
| 505 | | // huge amounts of processor time when files contain many similar blocks. |
| 506 | | bool abortSearch = false; |
| 507 | | |
| 508 | | // Search for each block size in turn |
| 509 | | // NOTE: Do the smallest size first, so that the scheme for adding |
| 510 | | // entries in the found list works as expected and replaces smaller blocks |
| 511 | | // with larger blocks when it finds matches at the same offset in the file. |
| 512 | | for(int s = BACKUP_FILE_DIFF_MAX_BLOCK_SIZES - 1; s >= 0; --s) |
| 513 | | { |
| 514 | | ASSERT(Sizes[s] <= bufSize); |
| 515 | | BOX_TRACE("Diff pass " << s << ", for block size " << |
| 516 | | Sizes[s]); |
| 517 | | |
| 518 | | // Check we haven't finished |
| 519 | | if(Sizes[s] == 0) |
| 520 | | { |
| 521 | | // empty entry, try next size |
| 522 | | continue; |
| 523 | | } |
| 524 | | |
| 525 | | // Set up the hash table entries |
| 526 | | SetupHashTable(pIndex, NumBlocks, Sizes[s], phashTable); |
| | 628 | // Shift file position back to beginning |
| | 629 | int64_t bufferFileOffset = 0; |
| | 630 | rFile.Seek(0, IOStream::SeekType_Absolute); |
| 528 | | // Shift file position to beginning |
| 529 | | rFile.Seek(0, IOStream::SeekType_Absolute); |
| 530 | | |
| 531 | | // Read first block |
| 532 | | if(rFile.Read(pbuffer0, Sizes[s]) != Sizes[s]) |
| | 632 | // We're going to be flipping back and forth between two buffers, the low and high |
| | 633 | uint8_t *lowBuffer = pbuffer0; |
| | 634 | int32_t lowBufferBytes = 0; |
| | 635 | uint8_t *highBuffer = pbuffer1; |
| | 636 | int32_t highBufferBytes = 0; |
| | 637 | |
| | 638 | // In some cases we need to carry over reads from prior buffers |
| | 639 | int32_t carryOverBytes[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES]; |
| | 640 | ::memset(carryOverBytes, 0, (sizeof(int32_t) * BACKUP_FILE_DIFF_MAX_BLOCK_SIZES)); |
| | 641 | |
| | 642 | // Read the first buffer's worth of data |
| | 643 | lowBufferBytes = rFile.Read(lowBuffer, maxScanBlockSize); |
| | 644 | // Fill the second buffer if appropriate |
| | 645 | if(lowBufferBytes == maxScanBlockSize) |
| | 646 | { |
| | 647 | highBufferBytes = rFile.Read(highBuffer, maxScanBlockSize); |
| | 648 | } |
| | 649 | |
| | 650 | // For every block size, initialize our scan tracking |
| | 651 | for(int z = 0; z < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++z) |
| | 652 | { |
| | 653 | ASSERT(scanSizes[z] <= maxScanBlockSize); |
| | 654 | |
| | 655 | // The sizes array may be mostly empty, in those cases we have no |
| | 656 | // state to maintain. |
| | 657 | if(scanSizes[z] != 0) |
| 534 | | // Size of file too short to match -- do next size |
| 535 | | continue; |
| 536 | | } |
| 537 | | |
| 538 | | // Setup block pointers |
| 539 | | uint8_t *beginnings = pbuffer0; |
| 540 | | uint8_t *endings = pbuffer1; |
| 541 | | int offset = 0; |
| 542 | | |
| 543 | | // Calculate the first checksum, ready for rolling |
| 544 | | RollingChecksum rolling(beginnings, Sizes[s]); |
| 545 | | |
| 546 | | // Then roll, until the file is exhausted |
| 547 | | int64_t fileBlockNumber = 0; |
| 548 | | int64_t fileOffset = 0; |
| 549 | | int rollOverInitialBytes = 0; |
| 550 | | while(true) |
| 551 | | { |
| 552 | | if(maximumDiffingTime.HasExpired()) |
| 553 | | { |
| 554 | | ASSERT(pDiffTimer != NULL); |
| 555 | | BOX_INFO("MaximumDiffingTime reached - " |
| 556 | | "suspending file diff"); |
| 557 | | abortSearch = true; |
| 558 | | break; |
| 559 | | } |
| | 659 | // Mark for scan |
| | 660 | scanThisSize[z] = true; |
| 573 | | int spaceLeft = bytesInEndings - offset; |
| 574 | | int thisRoll = (rollOverInitialBytes > spaceLeft) ? spaceLeft : rollOverInitialBytes; |
| | 674 | rollingSums[z] = new RollingChecksum(lowBuffer, scanSizes[z]); |
| | 675 | } |
| | 676 | } |
| | 677 | } |
| | 678 | |
| | 679 | // Read loop while we can get full maxScanBlockSize reads |
| | 680 | while(highBufferBytes == maxScanBlockSize) |
| | 681 | { |
| | 682 | // Oh happy day! We have maxScanBlockSize * 2 bytes available to us across |
| | 683 | // the two buffers, which means we can walk every block size across all offsets |
| | 684 | // in lowBuffer without any concern about overrunning the data available in highBuffer. |