How to remove illegal characters from path and filenames?

前端 未结 29 3366
离开以前
离开以前 2020-11-22 17:18

I need a robust and simple way to remove illegal path and file characters from a simple string. I've used the below code but it doesn't seem to do anything; what am I missing?

29条回答
  •  独厮守ぢ
    2020-11-22 17:25

    Here is my small contribution: a method that replaces the characters in a single char buffer, without creating intermediate strings or StringBuilders. It's fast, easy to understand, and a good alternative to the other approaches mentioned in this post.

    // Backing field for InvalidCharsHash; stays null until the property is first read.
    private static HashSet<char> _invalidCharsHash;

    // Set of the characters returned by Path.GetInvalidFileNameChars(), built on first
    // access and reused afterwards. The element type must be spelled out: HashSet<T> is
    // generic and has no non-generic form.
    private static HashSet<char> InvalidCharsHash
    {
      get { return _invalidCharsHash ?? (_invalidCharsHash = new HashSet<char>(Path.GetInvalidFileNameChars())); }
    }
    
    // Returns a copy of fileName in which every character contained in InvalidCharsHash
    // is replaced by the first character of newValue. The result always has the same
    // length as fileName; nothing is removed.
    //
    // fileName: the text to sanitize; must not be null.
    // newValue: replacement text; only its first character is used. Defaults to "_".
    //
    // Throws System.ArgumentNullException when fileName is null, and
    // System.ArgumentException when newValue is null or empty (there would be no
    // character to substitute).
    private static string ReplaceInvalidChars(string fileName, string newValue = "_")
    {
      if (fileName == null)
      {
        throw new System.ArgumentNullException(nameof(fileName));
      }

      if (string.IsNullOrEmpty(newValue))
      {
        throw new System.ArgumentException("A replacement character is required.", nameof(newValue));
      }

      char newChar = newValue[0];

      // Read the property once instead of on every loop iteration.
      var invalid = InvalidCharsHash;

      char[] chars = fileName.ToCharArray();
      for (int i = 0; i < chars.Length; i++)
      {
        if (invalid.Contains(chars[i]))
        {
          chars[i] = newChar;
        }
      }

      return new string(chars);
    }
    

    You can call it like this:

    // Sample input mixing ordinary text with characters that are not allowed in file names.
    string illegal = "\"M <>\"\\a/ry/ h**ad:>> a\\/:*?\"<>| li*tt|le|| la\"mb.?";
    // The second argument supplies the replacement; only its first character is used.
    string legal = ReplaceInvalidChars(illegal, "_");
    

    and returns:

    _M ____a_ry_ h__ad___ a_________ li_tt_le__ la_mb._
    

    It's worth noting that this method always replaces invalid chars with the given value and never removes them. If you want to remove invalid chars instead, this alternative will do the trick:

    // Copies fileName into a new string, handling every character found in
    // InvalidCharsHash as follows:
    //   - newValue null or empty (or starting with '\0'): the character is dropped;
    //   - otherwise: the character is replaced by the first character of newValue.
    // Characters not in the set are copied unchanged.
    private static string RemoveInvalidChars(string fileName, string newValue)
    {
      char replacement = char.MinValue;
      if (!string.IsNullOrEmpty(newValue))
      {
        replacement = newValue[0];
      }

      // char.MinValue doubles as the "drop it" marker.
      bool dropInvalid = replacement == char.MinValue;

      char[] source = fileName.ToCharArray();
      // The output can never be longer than the input, so one buffer of equal size suffices.
      char[] buffer = new char[source.Length];
      int written = 0;

      foreach (char current in source)
      {
        if (!InvalidCharsHash.Contains(current))
        {
          buffer[written++] = current;
          continue;
        }

        if (dropInvalid)
        {
          continue;
        }

        buffer[written++] = replacement;
      }

      // Only the first `written` slots hold output.
      return new string(buffer, 0, written);
    }
    

    BENCHMARK

    If performance is what you are after, I ran timed tests on most of the methods found in this post. Some of these methods don't replace with a given char, since the OP was asking to clean the string. I added tests that replace with a given char, and others that replace with an empty string for scenarios that only need to remove the unwanted chars. The code used for this benchmark is at the end, so you can run your own tests.

    Note: Methods Test1 and Test2 are both proposed in this post.

    First Run

    replacing with '_', 1000000 iterations

    Results:

    ============Test1===============
    Elapsed=00:00:01.6665595
    Result=_M ____a_ry_ h__ad___ a_________ li_tt_le__ la_mb._
    
    ============Test2===============
    Elapsed=00:00:01.7526835
    Result=_M ____a_ry_ h__ad___ a_________ li_tt_le__ la_mb._
    
    ============Test3===============
    Elapsed=00:00:05.2306227
    Result=_M ____a_ry_ h__ad___ a_________ li_tt_le__ la_mb._
    
    ============Test4===============
    Elapsed=00:00:14.8203696
    Result=_M ____a_ry_ h__ad___ a_________ li_tt_le__ la_mb._
    
    ============Test5===============
    Elapsed=00:00:01.8273760
    Result=_M ____a_ry_ h__ad___ a_________ li_tt_le__ la_mb._
    
    ============Test6===============
    Elapsed=00:00:05.4249985
    Result=_M ____a_ry_ h__ad___ a_________ li_tt_le__ la_mb._
    
    ============Test7===============
    Elapsed=00:00:07.5653833
    Result=_M ____a_ry_ h__ad___ a_________ li_tt_le__ la_mb._
    
    ============Test8===============
    Elapsed=00:12:23.1410106
    Result=_M ____a_ry_ h__ad___ a_________ li_tt_le__ la_mb._
    
    ============Test9===============
    Elapsed=00:00:02.1016708
    Result=_M ____a_ry_ h__ad___ a_________ li_tt_le__ la_mb._
    
    ============Test10===============
    Elapsed=00:00:05.0987225
    Result=M ary had a little lamb.
    
    ============Test11===============
    Elapsed=00:00:06.8004289
    Result=M ary had a little lamb.
    

    Second Run

    removing invalid chars, 1000000 iterations

    Note: Test1 will not remove, only replace.

    Results:

    ============Test1===============
    Elapsed=00:00:01.6945352
    Result= M     a ry  h  ad    a          li tt le   la mb.
    
    ============Test2===============
    Elapsed=00:00:01.4798049
    Result=M ary had a little lamb.
    
    ============Test3===============
    Elapsed=00:00:04.0415688
    Result=M ary had a little lamb.
    
    ============Test4===============
    Elapsed=00:00:14.3397960
    Result=M ary had a little lamb.
    
    ============Test5===============
    Elapsed=00:00:01.6782505
    Result=M ary had a little lamb.
    
    ============Test6===============
    Elapsed=00:00:04.9251707
    Result=M ary had a little lamb.
    
    ============Test7===============
    Elapsed=00:00:07.9562379
    Result=M ary had a little lamb.
    
    ============Test8===============
    Elapsed=00:12:16.2918943
    Result=M ary had a little lamb.
    
    ============Test9===============
    Elapsed=00:00:02.0770277
    Result=M ary had a little lamb.
    
    ============Test10===============
    Elapsed=00:00:05.2721232
    Result=M ary had a little lamb.
    
    ============Test11===============
    Elapsed=00:00:05.2802903
    Result=M ary had a little lamb.
    

    BENCHMARK RESULTS

    Methods Test1, Test2 and Test5 are the fastest. Method Test8 is the slowest.

    CODE

    Here's the complete code of the benchmark:

    // Backing field for InvalidCharsHash; null until the property is first read.
    private static HashSet<char> _invalidCharsHash;

    // Hash set of the characters returned by Path.GetInvalidFileNameChars(), created on
    // first access and then reused. HashSet<T> is generic, so the element type (char)
    // has to be stated explicitly.
    private static HashSet<char> InvalidCharsHash
    {
      get { return _invalidCharsHash ?? (_invalidCharsHash = new HashSet<char>(Path.GetInvalidFileNameChars())); }
    }
    
    // Backing field for InvalidCharsValue; null until the property is first read.
    private static string _invalidCharsValue;

    // The characters returned by Path.GetInvalidFileNameChars(), packed into one string.
    // Built on first access and reused afterwards.
    private static string InvalidCharsValue
    {
      get
      {
        if (_invalidCharsValue == null)
        {
          _invalidCharsValue = new string(Path.GetInvalidFileNameChars());
        }

        return _invalidCharsValue;
      }
    }
    
    // Backing field for InvalidChars; null until the property is first read.
    private static char[] _invalidChars;

    // The array returned by Path.GetInvalidFileNameChars(), fetched on first access and
    // then handed out again on every later read.
    private static char[] InvalidChars
    {
      get
      {
        if (_invalidChars == null)
        {
          _invalidChars = Path.GetInvalidFileNameChars();
        }

        return _invalidChars;
      }
    }
    
    // Benchmark entry point: times Test1 through Test11, in that order, against the same
    // sample string, then waits for a key press so the console output stays visible.
    static void Main(string[] args)
    {
      const int iterations = 1000000;

      // An empty replacement selects the "remove" scenario in the candidates that support it.
      string replacement = "";
      string sample = "\"M <>\"\\a/ry/ h**ad:>> a\\/:*?\"<>| li*tt|le|| la\"mb.?";

      Func<string, string, string>[] candidates =
      {
        Test1, Test2, Test3, Test4, Test5, Test6,
        Test7, Test8, Test9, Test10, Test11
      };

      foreach (Func<string, string, string> candidate in candidates)
      {
        TimeBenchmark(iterations, candidate, sample, replacement);
      }

      Console.Read();
    }
    
    // Calls func(testString, newValue) maxLoop times, measures the total elapsed time with
    // a Stopwatch, and writes a short report to the console: the method name, the elapsed
    // time, and the value returned by the last call.
    //
    // maxLoop:    number of invocations; zero or less means func is never called and the
    //             reported result is the empty string.
    // func:       the candidate under test; must not be null.
    // testString: first argument forwarded to func.
    // newValue:   second argument forwarded to func.
    //
    // Throws ArgumentNullException when func is null, instead of looping over a null
    // delegate and failing later while printing its name.
    private static void TimeBenchmark(int maxLoop, Func<string, string, string> func, string testString, string newValue)
    {
      if (func == null)
      {
        throw new ArgumentNullException(nameof(func));
      }

      string result = string.Empty;
      var sw = Stopwatch.StartNew();

      for (int i = 0; i < maxLoop; i++)
      {
        result = func(testString, newValue);
      }

      sw.Stop();

      Console.WriteLine($"============{func.Method.Name}===============");
      Console.WriteLine("Elapsed={0}", sw.Elapsed);
      Console.WriteLine("Result={0}", result);
      Console.WriteLine("");
    }
    
    // Replace-only candidate: overwrites every character found in InvalidCharsHash with
    // the first character of newValue. When newValue is null or empty the substitute is
    // char.MinValue ('\0'), so nothing is ever removed and the length is unchanged.
    private static string Test1(string fileName, string newValue)
    {
      char substitute;
      if (string.IsNullOrEmpty(newValue))
      {
        substitute = char.MinValue;
      }
      else
      {
        substitute = newValue[0];
      }

      char[] buffer = fileName.ToCharArray();
      for (int index = 0; index < buffer.Length; index++)
      {
        if (!InvalidCharsHash.Contains(buffer[index]))
        {
          continue;
        }

        buffer[index] = substitute;
      }

      return new string(buffer);
    }
    
    // Replace-or-remove candidate: copies fileName into a second buffer. A character
    // found in InvalidCharsHash is dropped when newValue is null or empty (or starts with
    // '\0'); otherwise it is replaced by the first character of newValue.
    private static string Test2(string fileName, string newValue)
    {
      char substitute = char.MinValue;
      if (!string.IsNullOrEmpty(newValue))
      {
        substitute = newValue[0];
      }

      // char.MinValue is used as the marker for "remove instead of replace".
      bool dropInvalid = substitute == char.MinValue;

      char[] input = fileName.ToCharArray();
      char[] output = new char[input.Length];
      int count = 0;

      foreach (char current in input)
      {
        if (!InvalidCharsHash.Contains(current))
        {
          output[count++] = current;
          continue;
        }

        if (dropInvalid)
        {
          continue;
        }

        output[count++] = substitute;
      }

      // Only the first `count` slots of the buffer were filled.
      return new string(output, 0, count);
    }
    
    // string.Replace candidate: runs one Replace pass over the text for each character in
    // InvalidCharsValue, substituting newValue for every occurrence.
    private static string Test3(string filename, string newValue)
    {
      string invalid = InvalidCharsValue;
      string cleaned = filename;

      for (int k = 0; k < invalid.Length; k++)
      {
        cleaned = cleaned.Replace(invalid[k].ToString(), newValue);
      }

      return cleaned;
    }
    
    // Regex candidate: builds a character class from the escaped InvalidCharsValue,
    // constructs a new Regex on every call, and replaces each match with newValue.
    private static string Test4(string filename, string newValue)
    {
      string pattern = "[" + Regex.Escape(InvalidCharsValue) + "]";
      return new Regex(pattern).Replace(filename, newValue);
    }
    
    // Split/Join candidate: splits the text at every character in InvalidChars and glues
    // the pieces back together with newValue as the separator.
    private static string Test5(string filename, string newValue)
    {
      string[] pieces = filename.Split(InvalidChars);
      return string.Join(newValue, pieces);
    }
    
    // Fold candidate: starting from fileName, applies one string.Replace per character in
    // InvalidChars, feeding each result into the next replacement.
    private static string Test6(string fileName, string newValue)
    {
      string current = fileName;

      foreach (char invalid in InvalidChars)
      {
        current = current.Replace(invalid.ToString(), newValue);
      }

      return current;
    }
    
    // Static Regex.Replace candidate: matches any character of the escaped
    // InvalidCharsValue character class, using RegexOptions.Compiled, and substitutes newValue.
    private static string Test7(string fileName, string newValue)
    {
      string pattern = string.Concat("[", Regex.Escape(InvalidCharsValue), "]");
      return Regex.Replace(fileName, pattern, newValue, RegexOptions.Compiled);
    }
    
    // Compiled Regex instance candidate: constructs a new Regex with the Singleline,
    // Compiled and CultureInvariant options on every call, then replaces each character
    // of the InvalidCharsValue character class with newValue.
    private static string Test8(string fileName, string newValue)
    {
      const RegexOptions options =
        RegexOptions.Singleline | RegexOptions.Compiled | RegexOptions.CultureInvariant;

      string pattern = "[" + Regex.Escape(InvalidCharsValue) + "]";
      var matcher = new Regex(pattern, options);

      return matcher.Replace(fileName, newValue);
    }
    
    // StringBuilder candidate: appends newValue for each character found in
    // InvalidCharsHash and the character itself otherwise.
    // Returns newValue when the builder ends up empty, the original fileName instance
    // when nothing was replaced, and the built string in all other cases.
    private static string Test9(string fileName, string newValue)
    {
      var builder = new StringBuilder(fileName.Length);
      bool replacedAny = false;

      foreach (char current in fileName)
      {
        if (!InvalidCharsHash.Contains(current))
        {
          builder.Append(current);
          continue;
        }

        replacedAny = true;
        builder.Append(newValue);
      }

      if (builder.Length == 0)
      {
        return newValue;
      }

      if (!replacedAny)
      {
        return fileName;
      }

      return builder.ToString();
    }
    
    // LINQ candidate with a fast path: returns fileName itself when none of its
    // characters is in InvalidChars; otherwise returns a new string with those characters
    // filtered out. newValue is not used, so this candidate always removes.
    private static string Test10(string fileName, string newValue)
    {
      bool allValid = fileName.All(c => !InvalidChars.Contains(c));
      if (allValid)
      {
        return fileName;
      }

      char[] kept = fileName.Where(c => !InvalidChars.Contains(c)).ToArray();
      return new string(kept);
    }
    
    // Plain LINQ candidate: keeps only the characters that are not in InvalidChars and
    // builds a new string from them. newValue is not used, so this candidate always removes.
    private static string Test11(string fileName, string newValue)
    {
      var kept = from x in fileName
                 where !InvalidChars.Contains(x)
                 select x;

      return new string(kept.ToArray());
    }
    

提交回复
热议问题