Does anyone have a good Proper Case algorithm?

前端 未结 13 1234
一生所求
一生所求 2020-12-15 04:03

Does anyone have a trusted Proper Case or PCase algorithm (similar to a UCase or Upper)? I'm looking for something that takes a value such as "GEORGE BURDELL" and converts it to proper case ("George Burdell").

13条回答
  •  青春惊慌失措
    2020-12-15 05:07

    I did a quick C# port of https://github.com/tamtamchik/namecase, which is based on Lingua::EN::NameCase.

    /// <summary>
    /// Converts personal names to their conventional mixed-case spelling,
    /// e.g. "MCCARTHY" -> "McCarthy", "VAN DER POST" -> "van der Post", "HENRY VIII" -> "Henry VIII".
    /// Quick C# port of tamtamchik/namecase, which is based on Lingua::EN::NameCase.
    /// </summary>
    public static class CIQNameCase
    {
        // "Mac" surnames that are NOT Gaelic patronymics: the generic Mac rule turns
        // "Machin" into "MacHin", and these patterns turn it back.
        private static readonly Dictionary<string, string> _exceptions = new Dictionary<string, string>
            {
                {@"\bMacEdo"     ,"Macedo"},
                {@"\bMacEvicius" ,"Macevicius"},
                {@"\bMacHado"    ,"Machado"},
                {@"\bMacHar"     ,"Machar"},
                {@"\bMacHin"     ,"Machin"},
                {@"\bMacHlin"    ,"Machlin"},
                {@"\bMacIas"     ,"Macias"},
                {@"\bMacIulis"   ,"Maciulis"},
                {@"\bMacKie"     ,"Mackie"},
                {@"\bMacKle"     ,"Mackle"},
                {@"\bMacKlin"    ,"Macklin"},
                {@"\bMacKmin"    ,"Mackmin"},
                {@"\bMacQuarie"  ,"Macquarie"}
            };

        // Name particles that are conventionally written in lower case.
        // Keys are regex patterns, values are regex replacement strings.
        private static readonly Dictionary<string, string> _replacements = new Dictionary<string, string>
            {
                {@"\bAl(?=\s+\w)"         , @"al"},        // al Arabic or forename Al.
                {@"\bBin(t[ie])?\b"       , @"bin$1"},     // bin, binti, binte Arabic (suffix is preserved)
                {@"\bAp\b"                , @"ap"},        // ap Welsh.
                {@"\bBen(?=\s+\w)"        , @"ben"},       // ben Hebrew or forename Ben.
                {@"\bDell([ae])\b"        , @"dell$1"},    // della and delle Italian.
                {@"\bD([aeiou])\b"        , @"d$1"},       // da, de, di Italian; du French; do Brasil
                {@"\bD([ao]s)\b"          , @"d$1"},       // das, dos Brasileiros
                {@"\bDe([lrn])\b"         , @"de$1"},      // del Italian; der/den Dutch/Flemish.
                {@"\bEl\b"                , @"el"},        // el Greek or El Spanish.
                {@"\bLa\b"                , @"la"},        // la French or La Spanish.
                {@"\bL([eo])\b"           , @"l$1"},       // lo Italian; le French.
                {@"\bVan(?=\s+\w)"        , @"van"},       // van German or forename Van.
                {@"\bVon\b"               , @"von"}        // von Dutch/Flemish
            };

        // Single-letter conjunctions (Spanish "y"/"e", Catalan "i") that are lower-cased when they stand alone.
        private static readonly string[] _conjunctions = { "Y", "E", "I" };

        // Matches a whole word that looks like a Roman numeral below 90 (it can also match the empty string).
        private static readonly string _romanRegex = @"\b((?:[Xx]{1,3}|[Xx][Ll]|[Ll][Xx]{0,3})?(?:[Ii]{1,3}|[Ii][VvXx]|[Vv][Ii]{0,3})?)\b";

        // Group 1 is the prefix, group 2 the letter that must become a capital.
        //  - "Mc" followed by a letter always qualifies.
        //  - "Mac" qualifies only when at least three letters follow, the word ends there, and its
        //    last letter is not one of a/c/i/o/z/j (endings typical of Italian, Spanish and Polish
        //    surnames such as "Macaluso"), so "Mack" and "Machado" are left alone.
        private static readonly string _macRegex = @"\b(Mc|Mac(?=[A-Za-z]{2,}(?![aciozj])[A-Za-z]\b))([A-Za-z])";

        /// <summary>
        /// Case a name field into its appropriate case format
        /// e.g. Smith, de la Cruz, Mary-Jane, O'Brien, McTaggart
        /// </summary>
        /// <param name="nameString">The name in any casing. May be null or empty.</param>
        /// <returns>The re-cased name; null or empty input is returned unchanged.</returns>
        public static string NameCase(string nameString)
        {
            if (string.IsNullOrEmpty(nameString))
            {
                return nameString;
            }

            // Capitalize
            nameString = Capitalize(nameString);
            nameString = UpdateIrish(nameString);

            // Fixes for "son (daughter) of" etc
            foreach (KeyValuePair<string, string> replacement in _replacements)
            {
                nameString = Regex.Replace(nameString, replacement.Key, replacement.Value);
            }

            nameString = UpdateRoman(nameString);
            nameString = FixConjunction(nameString);

            return nameString;
        }

        /// <summary>
        /// Lower-cases the whole string, then capitalizes the first letter of every word.
        /// </summary>
        /// <param name="nameString">The name to capitalize.</param>
        /// <returns>The name with each word starting with a capital letter.</returns>
        private static string Capitalize(string nameString)
        {
            // Invariant casing: the patterns in this class are written against ASCII letters, and
            // culture-sensitive casing (e.g. Turkish dotted/dotless i) would stop them from matching.
            nameString = nameString.ToLowerInvariant();
            nameString = Regex.Replace(nameString, @"\b\w", x => x.Value.ToUpperInvariant());
            nameString = Regex.Replace(nameString, @"'\w\b", x => x.Value.ToLowerInvariant()); // Lowercase 's
            return nameString;
        }

        /// <summary>
        /// Update for Irish names.
        /// </summary>
        /// <param name="nameString">A name that has already been through <see cref="Capitalize"/>.</param>
        /// <returns>The name with Mac/Mc surnames given their inner capital.</returns>
        private static string UpdateIrish(string nameString)
        {
            nameString = UpdateMac(nameString);

            // "Macmurdo" ends in 'o', so the general Mac rule skips it; it still takes an inner capital.
            return Regex.Replace(nameString, @"\bMacmurdo\b", "MacMurdo");
        }

        /// <summary>
        /// Updates irish Mac &amp; Mc: capitalizes the letter after every qualifying prefix
        /// in the string, then undoes the change for the known non-Gaelic "Mac" surnames.
        /// </summary>
        /// <param name="nameString">A name that has already been through <see cref="Capitalize"/>.</param>
        /// <returns>The updated name.</returns>
        private static string UpdateMac(string nameString)
        {
            // Replace in place, match by match, so several Mac/Mc names in one string are all handled.
            nameString = Regex.Replace(
                nameString,
                _macRegex,
                x => x.Groups[1].Value + x.Groups[2].Value.ToUpperInvariant());

            // Now fix "Mac" exceptions
            foreach (KeyValuePair<string, string> exception in _exceptions)
            {
                nameString = Regex.Replace(nameString, exception.Key, exception.Value);
            }

            return nameString;
        }

        /// <summary>
        /// Fix roman numeral names.
        /// </summary>
        /// <param name="nameString">The name to process.</param>
        /// <returns>The name with whole-word Roman numerals in upper case.</returns>
        private static string UpdateRoman(string nameString)
        {
            // Upper-case each match where it was found. Re-using the matched text as a search
            // pattern would also hit the same letters inside other words ("Vivian Vi" -> "VIvian VI").
            return Regex.Replace(nameString, _romanRegex, x => x.Value.ToUpperInvariant());
        }

        /// <summary>
        /// Fix Spanish conjunctions.
        /// </summary>
        /// <param name="nameString">The name to process.</param>
        /// <returns>The name with stand-alone "Y", "E" and "I" lower-cased.</returns>
        private static string FixConjunction(string nameString)
        {
            foreach (var conjunction in _conjunctions)
            {
                nameString = Regex.Replace(nameString, @"\b" + conjunction + @"\b", x => x.Value.ToLowerInvariant());
            }
            return nameString;
        }
    }
    

    Usage

    // The input may be in any casing: NameCase lower-cases it first and then re-cases it,
    // so "McCarthy" comes back as "McCarthy".
    string name_cased = CIQNameCase.NameCase("McCarthy");
    

    This is my test method; everything seems to pass OK:

    /// <summary>
    /// Round-trip check: every expected spelling is upper-cased, passed through
    /// CIQNameCase.NameCase, and must come back exactly as it was written.
    /// </summary>
    [TestMethod]
    public void Test_NameCase_1()
    {
        string[] names = {
            "Keith", "Yuri's", "Leigh-Williams", "McCarthy",
            // Mac exceptions
            "Machin", "Machlin", "Machar",
            "Mackle", "Macklin", "Mackie",
            "Macquarie", "Machado", "Macevicius",
            "Maciulis", "Macias", "MacMurdo",
            // General
            "O'Callaghan", "St. John", "von Streit",
            "van Dyke", "Van", "ap Llwyd Dafydd",
            "al Fahd", "Al",
            "el Grecco",
            "ben Gurion", "Ben",
            "da Vinci",
            "di Caprio", "du Pont", "de Legate",
            "del Crond", "der Sind", "van der Post", "van den Thillart",
            "von Trapp", "la Poisson", "le Figaro",
            "Mack Knife", "Dougal MacDonald",
            "Ruiz y Picasso", "Dato e Iradier", "Mas i Gavarró",
            // Roman numerals
            "Henry VIII", "Louis III", "Louis XIV",
            "Charles II", "Fred XLIX", "Yusof bin Ishak",
        };

        foreach (string expected in names)
        {
            // Invariant casing keeps the all-caps input identical on every machine culture.
            string upper = expected.ToUpperInvariant();
            string actual = CIQNameCase.NameCase(upper);
            Console.WriteLine("name: {0} -> {1}  -> {2}", expected, upper, actual);

            // AreEqual reports both the expected and the actual value when a name fails.
            Assert.AreEqual(expected, actual);
        }
    }
    

提交回复
热议问题