I drafted a quick little program to find exact duplicate files on my machine, but it's rather slow. Can I speed this up?
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Security.Cryptography;

namespace DupeFinder
{
    class Program
    {
        static void Main(string[] args)
        {
            Console.WriteLine("Getting files...");
            var files = Directory.GetFiles(@"D:\Photos", "*", SearchOption.AllDirectories);
            var alg = new HMACMD5();
            var dict = new Dictionary<string, List<string>>();

            Console.WriteLine("Computing hashes...");
            int i = 0;
            int cursorLeft = Console.CursorLeft;
            int cursorTop = Console.CursorTop;
            foreach (var fileName in files)
            {
                Console.SetCursorPosition(cursorLeft, cursorTop);
                Console.Write("Hashing file {0}/{1}", ++i, files.Length);
                using (var stream = new BufferedStream(File.OpenRead(fileName), 1024 * 1024 * 5))
                {
                    // Key duplicates by the hex string of the hash.
                    var hash = alg.ComputeHash(stream);
                    var str = BitConverter.ToString(hash);
                    if (!dict.ContainsKey(str)) dict[str] = new List<string>();
                    dict[str].Add(fileName);
                }
            }
            Console.WriteLine();

            // Any key with two or more files is a set of duplicates.
            foreach (var dupe in dict.Where(p => p.Value.Count >= 2))
            {
                Console.WriteLine(string.Join(", ", dupe.Value));
            }
            Console.WriteLine("Done!");
            Console.ReadLine();
        }
    }
}
Possible optimizations:
- Avoid converting the byte array to a string first. I tried this, but it doesn't work; I'm guessing because the dictionary uses reference equality rather than comparing the bytes.
- Faster hashing algorithm? (see the sketch below)
- Different stream, or buffer size? I tried 1024^2, which should be a megabyte, but that seemed to slow it down, if anything.
Or is this just an inherently slow thing to do?
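For the "faster hashing algorithm" bullet, here is a minimal sketch of one thing I could try (untested on my data): HMACMD5 is keyed MD5 and mixes in a random per-run key, so plain MD5 via MD5.Create() should do strictly less work per file, and it also produces the same digest across runs:

using System.IO;
using System.Security.Cryptography;

static class Hashing
{
    // Plain, unkeyed MD5: no random per-run key, so the same file always
    // produces the same 16-byte digest, and there's no HMAC overhead.
    public static byte[] GetMd5(string fileName)
    {
        using (var md5 = MD5.Create())
        using (var stream = File.OpenRead(fileName))
            return md5.ComputeHash(stream);
    }
}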
I remembered that dictionaries can accept a custom IEqualityComparer, so I can write my own byte[] comparer.

Most of the comparers I found on the internet tend to compare the lengths first, which I don't need, because I know the hashes will always be 16 bytes. They also tend to compare one byte at a time, but I'm on a 64-bit machine, so why not compare 8 bytes at a time?
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Security.Cryptography;

namespace DupeFinder
{
    class Program
    {
        static void Main(string[] args)
        {
            Console.WriteLine("Getting files...");
            string dir = @"D:\Photos";
            var files = Directory.GetFiles(dir, "*", SearchOption.AllDirectories);
            var alg = new HMACMD5();
            // Key directly on the raw hash bytes, using the custom comparer below.
            var dict = new Dictionary<byte[], List<string>>(new Md5Comparer());

            Console.WriteLine("Computing hashes...");
            int i = 0;
            int cursorLeft = Console.CursorLeft;
            int cursorTop = Console.CursorTop;
            foreach (var fileName in files)
            {
                Console.SetCursorPosition(cursorLeft, cursorTop);
                Console.Write("Hashing file {0}/{1}", ++i, files.Length);
                using (var stream = new BufferedStream(File.OpenRead(fileName), 1024 * 1024 * 5))
                {
                    var hash = alg.ComputeHash(stream);
                    if (!dict.ContainsKey(hash)) dict[hash] = new List<string>();
                    dict[hash].Add(fileName);
                }
            }
            Console.WriteLine();

            // Write the duplicate groups out to a file instead of the console.
            using (var sw = new StreamWriter(Path.Combine(dir, "duplicates.txt")))
            {
                i = 0;
                foreach (var dupe in dict.Where(p => p.Value.Count >= 2))
                {
                    sw.WriteLine("Duplicate {0}", ++i);
                    foreach (var fn in dupe.Value)
                    {
                        sw.WriteLine("- {0}", fn);
                    }
                }
            }
            Console.WriteLine("Done!");
            //Console.ReadLine();
        }
    }

    class Md5Comparer : IEqualityComparer<byte[]>
    {
        public bool Equals(byte[] x, byte[] y)
        {
            // Compare the 16-byte digest as two 64-bit reads instead of byte-by-byte.
            var xi = BitConverter.ToInt64(x, 0);
            var yi = BitConverter.ToInt64(y, 0);
            if (xi != yi) return false;
            xi = BitConverter.ToInt64(x, 8);
            yi = BitConverter.ToInt64(y, 8);
            return xi == yi;
        }

        public int GetHashCode(byte[] obj)
        {
            return obj[0];
        }
    }
}
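(One thing I'm not sure about: my GetHashCode only looks at the first byte, so the dictionary can never have more than 256 distinct hash codes. A sketch of a variant with a wider hash code, assuming the input is always a full 16-byte MD5 digest:)

using System;
using System.Collections.Generic;

// Variant of my comparer: same Equals, but GetHashCode uses the first
// 4 bytes of the digest instead of only the first byte.
class Md5ComparerWide : IEqualityComparer<byte[]>
{
    public bool Equals(byte[] x, byte[] y)
    {
        return BitConverter.ToInt64(x, 0) == BitConverter.ToInt64(y, 0)
            && BitConverter.ToInt64(x, 8) == BitConverter.ToInt64(y, 8);
    }

    public int GetHashCode(byte[] obj)
    {
        return BitConverter.ToInt32(obj, 0); // 2^32 possible values instead of 256
    }
}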
I'm not sure if this is much faster. I haven't done any benchmarks, but it certainly doesn't seem any slower.
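To get actual numbers, the simplest thing is to wrap the whole run in a Stopwatch; a minimal sketch of the timing harness (FindDupes is a hypothetical stand-in for whichever version is being measured):

using System;
using System.Diagnostics;

class Timing
{
    static void Main()
    {
        var watch = Stopwatch.StartNew();
        FindDupes(); // hypothetical stand-in for the version under test
        Console.WriteLine("{0:0.000} seconds", watch.ElapsedMilliseconds / 1000d);
    }

    // Hypothetical placeholder: whichever duplicate-finding routine is being timed.
    static void FindDupes() { }
}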
New code:
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.IO;
using System.Linq;
using System.Text;
using System.Security.Cryptography;

namespace DupeFinder
{
    class Program
    {
        static void Main(string[] args)
        {
            var watch = Stopwatch.StartNew();
            const string dir = @"D:\Photos";
            var md5Comparer = new Md5Comparer();
            // Group by file length first (cheap), then hash only the files
            // that share a length with at least one other file.
            var dupeGroups = Directory.EnumerateFiles(dir, "*", SearchOption.AllDirectories)
                .Select(fn => new FileInfo(fn))
                .GroupBy(fi => fi.Length)
                .Where(g => g.Count() > 1)
                .SelectMany(g => g
                    .GroupBy(fi => GetHash(fi.FullName), md5Comparer)
                    .Where(g2 => g2.Count() > 1));

            using (var sw = new StreamWriter(Path.Combine(dir, "duplicates.txt")))
            {
                int i = 0;
                foreach (var dupeGroup in dupeGroups)
                {
                    sw.WriteLine("Duplicate {0}", ++i);
                    foreach (FileInfo fi in dupeGroup)
                    {
                        sw.WriteLine("- {0}", fi.FullName);
                    }
                }
            }
            Console.WriteLine("{0:0.000} seconds", watch.ElapsedMilliseconds / 1000d); // 22.068 seconds to process 10K files, 37 GB, 463 dupes
            Console.ReadLine();
        }

        static readonly HMACMD5 md5Hasher = new HMACMD5();

        public static byte[] GetHash(string fileName)
        {
            using (var stream = File.OpenRead(fileName))
                return md5Hasher.ComputeHash(stream);
        }
    }

    class Md5Comparer : IEqualityComparer<byte[]>
    {
        public bool Equals(byte[] x, byte[] y)
        {
            var xi = BitConverter.ToInt64(x, 0);
            var yi = BitConverter.ToInt64(y, 0);
            if (xi != yi) return false;
            xi = BitConverter.ToInt64(x, 8);
            yi = BitConverter.ToInt64(y, 8);
            return xi == yi;
        }

        public int GetHashCode(byte[] obj)
        {
            return obj[0];
        }
    }
}
From 360+ seconds down to 22-70 seconds. Definitely an improvement!
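If I wanted to push further, each file's hash is independent of the others, so the hashing pass could run in parallel. A sketch (untested, and the names are my own; each file gets its own MD5 instance because hash objects aren't thread-safe):

using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Security.Cryptography;

static class ParallelDupes
{
    // Sketch: group by length first (cheap), then hash only the candidates
    // in parallel via PLINQ. String keys sidestep the byte[] comparer here.
    public static IEnumerable<IGrouping<string, FileInfo>> Find(string dir)
    {
        return Directory.EnumerateFiles(dir, "*", SearchOption.AllDirectories)
            .Select(fn => new FileInfo(fn))
            .GroupBy(fi => fi.Length)
            .Where(g => g.Count() > 1)
            .SelectMany(g => g)
            .AsParallel()
            .GroupBy(fi => GetHashString(fi.FullName))
            .Where(g => g.Count() > 1);
    }

    // One MD5 instance per file, since a shared instance isn't thread-safe.
    static string GetHashString(string fileName)
    {
        using (var md5 = MD5.Create())
        using (var stream = File.OpenRead(fileName))
            return BitConverter.ToString(md5.ComputeHash(stream));
    }
}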