// FileDigester.cs, Version 1.0

/*
  ---------- File filedigester.cs

  Copyright (c) 2003, Andrew Shapira
  All rights reserved.

  Redistribution and use in source and binary forms, with or without
  modification, are permitted provided that the following conditions
  are met:

    - Redistributions of source code must retain the above copyright notice,
      this list of conditions and the following disclaimer.

    - Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.

    - Neither the name of Andrew Shapira nor the names of its
      contributors may be used to endorse or promote products derived from
      this software without specific prior written permission.

  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

  --

  This file contains the "FileDigester" class.  The FileDigester class
  implements the digest method for writing and reading files (see below).

  This file also contains code tests for the FileDigester class.
  The tests are being distributed so that they serve as good examples
  of how to use the FileDigester class.  The test code requires other
  code that is not distributed in order to compile, but it is easy to
  modify for a standalone application.
  
  The most recent version of this implementation should be at
  http://www.onezero.org.

  If you like this software, you could make a small donation
  at http://www.onezero.org/donate.html.

  --
  
  The "digest method" is a way to determine whether the contents of a
  file have been completely written, in the face of possible program
  or system failure before or during writes.  With the digest method,
  files are written as usual, except that a digest of the file contents
  is written at the end of the file.  (The computation of the digest
  does not use the digest itself, of course.)  Files are read as usual,
  except that an extra step is added.  The extra step is to compute the
  digest of the file contents (not including the digest), and compare the
  computed digest to the digest that was read from the end of the file.
  If the digests match, the file was completely written; if the digests
  do not match, the file was not completely written.

  In the digest method, a file is said to be "correct" if and only if
  all of the following three conditions hold:
    - the file exists,
    - the file has a length greater than or equal to the digest length
      (a file that holds only a digest has empty contents), and
    - the digest of the file contents matches the digest stored at
      the end of the file.

  (We assume that length of the digest is fixed.)

  The FileDigester class uses SHA1 digests.  SHA1 digests are 20 bytes long.

  The term "digest method" in this context is mine and is probably
  not a standard term.  I learned about the method from Ulfar Erlingsson.
*/


using System;
using System.IO;
using System.Security.Cryptography;


namespace ZRiver.Utilities {


/*
  Writes and reads files (or streams) using the digest method: the
  payload is followed by its 20-byte SHA1 digest, and a reader accepts
  the data only if the digest it recomputes matches the stored one.

  An instance keeps a hash object and a scratch digest buffer between
  calls so that they can be reused across many reads and writes.
*/
class FileDigester {

  ////////////////////
  //////////////////// public members
  ////////////////////

  public FileDigester()
  {
    // SHA1.Create() is the framework's factory for the default SHA1
    // implementation; it avoids naming SHA1CryptoServiceProvider,
    // which newer frameworks mark as obsolete.
    d_sha1Provider = SHA1.Create();
    d_storedDigest = null;
  }

  /*
    Write "contents" to the "path" file, using the digest method.
    If the file already exists, its contents are overwritten.

    Throws ArgumentNullException if "contents" is null.  The check is
    made before the file is opened so that a bad argument cannot
    truncate an existing file.
  */
  public void Write(
      MemoryStream contents // file contents to write (start at Position 0)
    , string path           // path to file to write
  )
  {
    if (contents == null) {
      throw new ArgumentNullException("contents");
    }

    // FileMode.Create truncates an existing file; the using statement
    // closes the stream even when the write throws.
    using (FileStream ost = new FileStream(path, FileMode.Create)) {
      Write(contents, ost);
    }
  }

  /*
    Write "contents" to the "ost" stream, using the digest method:
    all of "contents" is written first, followed by the SHA1 digest
    of "contents".  It is up to the caller to close ost.

    It is allowable to call this function when contents.Length == 0;
    doing so will result in output that contains only a digest.

    Throws ArgumentNullException if either argument is null.
  */
  public void Write(
      MemoryStream contents // file contents to write (start at Position 0)
    , Stream ost            // where to write the file contents and digest
  )
  {
    if (contents == null) {
      throw new ArgumentNullException("contents");
    }
    if (ost == null) {
      throw new ArgumentNullException("ost");
    }

    contents.Position = 0;
    contents.WriteTo(ost);

    // ComputeHash(Stream) hashes from the stream's current position,
    // so rewind to cover the whole payload.
    contents.Position = 0;
    byte[] digest = d_sha1Provider.ComputeHash(contents);
    // assert digest.Length == SHA1Length

    ost.Write(digest, 0, digest.Length);
  }

  /*
    Read the "path" file using the digest method.  "correct" is set to
    false if the file does not exist (FileNotFoundException is caught
    here; other exceptions raised while opening the file propagate).
    Otherwise the file is opened for reading, handed to the stream
    overload of Read, and closed again.

    "contents" is emptied before the file is opened, so any data it
    previously held is lost even when the file does not exist.

    Throws ArgumentNullException if "contents" is null.
  */
  public void Read(
      out bool correct      // is the file correct in the digest method sense?
    , MemoryStream contents // buffer to read the file contents into
                            // (start at Position = 0; existing buffer
                            // contents will be overwritten)
    , string path           // path to the file to read
  )
  {
    if (contents == null) {
      throw new ArgumentNullException("contents");
    }

    contents.SetLength(0);

    FileStream ist;
    try {
      ist = new FileStream(path, FileMode.Open, FileAccess.Read);
    }
    catch (FileNotFoundException) {
      // A missing file is by definition not correct.
      correct = false;
      return;
    }

    try {
      Read(out correct, contents, ist);
    } finally {
      ist.Close();
    }
  }

  /*
    Use the digest method to determine whether a given stream is
    "correct" in the digest method sense.  This function reads "ist"
    from its current position to the end and stores the result in
    "contents", starting at position 0 in contents.  The function sets
    "correct" to true iff the stream is correct in the digest sense.
    If the stream is correct in the digest sense, then the contents
    of the stream (not including the digest) are presented to the
    caller in "contents", starting at contents.Position=0 and ending
    at contents.Position=contents.Length-1.

    Previously-contained buffer data in "contents", if any, is lost.
    When fewer than SHA1Length bytes remain in "ist", or "ist" ends
    before the number of bytes implied by its Length has been read,
    "contents" is left empty.  When the bytes are all read but the
    digests differ, "contents" holds the bytes that preceded the
    stored digest.

    It is up to the caller to close "ist" after calling this function.

    This function will automatically resize the "contents" buffer to
    hold the entire contents of the stream being read, and it fills
    the array returned by contents.GetBuffer() directly; "contents"
    must therefore be a resizable MemoryStream that exposes its buffer
    (for example one created with the parameterless constructor).
    When reading many small files, the caller should attempt to reuse
    the "contents" buffer in order to not burden the memory system.
    When reading large files, the method of reading the entire file may
    be undesirable because it requires that the entire file be held in
    memory all at once.  Perhaps a future variant of this function will
    read the file in chunks and compute the hash incrementally.

    This function will throw an OverflowException if the number of
    bytes to read is too large to fit in an int.  The reason for this
    is that the ComputeHash() function doesn't accept a long argument
    for the length.  A future version of this function could do digest
    computations in such a way so as to allow longer lengths (see above).

    The function requires that the length and position of the "ist"
    stream be known in advance.

    Throws ArgumentNullException if "contents" or "ist" is null.
  */
  public void Read(
      out bool correct        // whether the stream is correct
    , MemoryStream contents   // buffer to read the stream contents into
    , Stream ist              // where to read the stream from
  )
  {
    if (contents == null) {
      throw new ArgumentNullException("contents");
    }
    if (ist == null) {
      throw new ArgumentNullException("ist");
    }

    // Only the bytes between the current position and the end take
    // part; sizing by Length alone would be wrong for a stream that is
    // not positioned at its start.
    long remaining = ist.Length - ist.Position;
    if (remaining < SHA1Length) {
      contents.SetLength(0);
      correct = false;
      return;
    }

    int totalLength;
    checked {   // generate an OverflowException if it won't fit in an int
      totalLength = (int) remaining;
    }

    int payloadLength = totalLength - SHA1Length;

    contents.Position = 0;
    contents.SetLength(totalLength);

    // Stream.Read may return fewer bytes than requested, so keep
    // reading until everything has arrived or the stream ends early.
    byte[] buffer = contents.GetBuffer();
    int filled = 0;
    while (filled < totalLength) {
      int got = ist.Read(buffer, filled, totalLength - filled);
      if (got <= 0) {
        break;
      }
      filled += got;
    }
    if (filled < totalLength) {
      // The stream was shorter than its Length claimed.
      contents.SetLength(0);
      correct = false;
      return;
    }

    // Pull the stored digest off the tail, then trim it from contents.
    contents.Position = payloadLength;
    if (d_storedDigest == null) {
      d_storedDigest = new byte[SHA1Length];
    }
    contents.Read(d_storedDigest, 0, SHA1Length);
    contents.Position = 0;
    contents.SetLength(payloadLength);

    // There's no way to get ComputeHash to return a hash into a previously-
    // existing byte array, so we have to create a new array every time.
    byte[] computedDigest
      = d_sha1Provider.ComputeHash(contents.GetBuffer(), 0, payloadLength);
    correct = equal(computedDigest, d_storedDigest);
  }

  ////////////////////
  //////////////////// private members
  ////////////////////

  const int SHA1Length = 20;    // byte length of an SHA1 digest

  SHA1 d_sha1Provider;          // hash object reused across calls
  byte[] d_storedDigest;        // scratch buffer for the digest read from
                                // the end of a stream; allocated lazily

  /*
    Return true iff "a" and "b" have the same length and hold the
    same bytes.
  */
  static bool equal(byte[] a, byte[] b)
  {
    if (a.Length != b.Length) {
      return false;
    }

    for (int i = 0; i < a.Length; i++) {
      if (a[i] != b[i]) {
        return false;
      }
    }

    return true;
  }
}


#if CodeTest
class FileDigesterTest
{
  /*
    Note for anyone adding regression tests here: the tests may be run
    repeatedly from the same directory, so a test must not depend on a
    file being absent beforehand if the test itself creates that file.
  */

  /*
    Write the characters of "s" (one byte per character) to "fileName"
    through the digester, bracketed by CodeTest start/end notifications.
  */
  static void writeTest(string s, FileDigester fd, string fileName)
  {
    string label = String.Format("WriteTest {0}", fileName);
    ZRiver.CodeTest.Starting(label);

    MemoryStream payload = new MemoryStream();
    foreach (char c in s) {
      payload.WriteByte((byte) c);
    }
    fd.Write(payload, fileName);

    ZRiver.CodeTest.Ending(label);
  }

  /*
    Write the three sample files that the read tests read back.
  */
  static void writeTests()
  {
    FileDigester digester = new FileDigester();
    writeTest("", digester, "filedigester_empty.dm");
    writeTest("samiam", digester, "filedigester_samiam.dm");
    writeTest("hello", digester, "filedigester_hello.dm");
  }

  /*
    Read "fileName" through the digester, print whether it was correct,
    and, if it was, print the recovered contents one character per byte.
  */
  static void readTestInner(FileDigester fd, MemoryStream ms, string fileName)
  {
    bool ok;
    fd.Read(out ok, ms, fileName);
    Console.WriteLine("correct = {0}", ok);

    if (!ok) {
      return;
    }

    Console.Write("contents = '");
    long pending = ms.Length;
    while (pending > 0) {
      Console.Write("{0}", (char) ms.ReadByte());
      pending--;
    }
    Console.WriteLine("'");
  }

  /*
    Read test for a file that is expected to exist already (or, in one
    case, to be missing); no file is created here.
  */
  static void readTest1(FileDigester fd, MemoryStream ms, string fileName)
  {
    string label = String.Format("readTest1 {0}", fileName);

    ZRiver.CodeTest.Starting(label);
    readTestInner(fd, ms, fileName);
    ZRiver.CodeTest.Ending(label);
  }

  /*
    Read test for a file written without a digest: create a file of
    "n" bytes ('a', 'a'+1, ...) and then read it through the digester.
  */
  static void readTest2(int n, FileDigester fd, MemoryStream ms)
  {
    string fileName = String.Format("filedigester_incorrect{0}.dm", n);
    string label = String.Format("readTest2 {0}", fileName);
    ZRiver.CodeTest.Starting(label);

    FileStream raw = new FileStream(fileName, FileMode.Create);
    int offset = 0;
    while (offset < n) {
      raw.WriteByte((byte) ('a' + offset));
      offset++;
    }
    raw.Close();

    readTestInner(fd, ms, fileName);

    ZRiver.CodeTest.Ending(label);
  }

  /*
    Run every read test with one shared digester and one shared buffer.
  */
  static void readTests()
  {
    FileDigester digester = new FileDigester();
    MemoryStream buffer = new MemoryStream();

    string[] existingOrMissing = {
      "filedigester_empty.dm",
      "filedigester_samiam.dm",
      "filedigester_hello.dm",
      "filedigester_doesnotexist.dm"
    };
    foreach (string name in existingOrMissing) {
      readTest1(digester, buffer, name);
    }

    // Sizes on both sides of the 20-byte digest length.
    int[] rawSizes = { 1, 19, 20, 21, 30 };
    foreach (int size in rawSizes) {
      readTest2(size, digester, buffer);
    }
  }

  /*
    Write tests first: the read tests read the files they produce.
  */
  static void Test()
  {
    writeTests();
    readTests();
  }

  static void Main()
  {
    ConsoleAppMain.AppFunc entry = new ConsoleAppMain.AppFunc(Test);
    ConsoleAppMain.DoMain(entry);
  }
}
#endif


} // namespace
// Up to Andrew Shapira's home page.
// ash at onezero.org