Most efficient way to find if a string is mixedCase

问题

Suppose I have very long strings and I want to see if a column is allLower, allUpper, or mixedCase. For example with the following column

text
"hello"
"New"
"items"
"iTem12"
"-3nXy"

The text would be mixedCase. A naive algorithm to determine this might be:

// Result flags; they are assigned after the scan below.
int is_mixed_case, is_all_lower, is_all_upper;
// Each flag becomes 1 once an ASCII letter of that case has been seen.
int has_lower = 0;
int has_upper = 0;
// for each row...for each column...
// NOTE(review): `c` and `s` are not declared in this fragment; presumably `s`
// is the NUL-terminated string under test and `c` a char/int -- confirm.
for (int i = 0; (c=s[i]) != '\0'; i++) {
    // Only 'a'..'z' and 'A'..'Z' are treated as letters; every other
    // character leaves both flags untouched.
    if (c >='a' && c <= 'z') {
        has_lower = 1;
        if (has_upper) break;   // both cases seen: stop scanning early
    }
    else if (c >='A' && c <= 'Z') {
        has_upper = 1;
        if (has_lower) break;   // both cases seen: stop scanning early
    }
}

// A string without any letters leaves all three results at 0.
is_all_lower = has_lower && !has_upper;
is_all_upper = has_upper && !has_lower;
is_mixed_case = has_lower && has_upper;

I'm sure there would be a more performant way to do this, however. What might be the most efficient way to do this algorithm/calculation?

回答1:

If you know the character encoding that's going to be used (I've used ISO/IEC 8859-15 in the code example), a look-up table may be the fastest solution. This also allows you to decide which characters from the extended character set, such as µ or ß, you'll count as upper case, lower case or non-alphabetical.

/*
 * Classify the letter case of a NUL-terminated ISO/IEC 8859-15 (Latin-9)
 * string with a 256-entry look-up table.
 *
 * s: string to examine; must not be NULL.
 *
 * Returns a bit set: 0 = no letters, 1 = upper case only,
 * 2 = lower case only, 3 = mixed case.
 *
 * The scan stops as soon as both cases have been seen.
 */
char test_case(const char *s) {
    /* Per byte value: 0 = not a letter, 1 = upper case, 2 = lower case.
     * Each row covers 16 code points; the trailing comment shows the
     * letters of that row in their column positions. */
    static const char alphabet[256] = {
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  // 0x00
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  // 0x10
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  // 0x20
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  // 0x30
        0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  //  ABCDEFGHIJKLMNO
        1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,  // PQRSTUVWXYZ
        0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,  //  abcdefghijklmno
        2,2,2,2,2,2,2,2,2,2,2,0,0,0,0,0,  // pqrstuvwxyz
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  // 0x80
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  // 0x90
        // Latin-9 layout: 0xA6 S-caron, 0xA8 s-caron, 0xAA feminine ordinal.
        0,0,0,0,0,0,1,0,2,0,2,0,0,0,0,0,  //       Š š ª
        // 0xB4 Z-caron, 0xB5 micro, 0xB8 z-caron, 0xBA masculine ordinal,
        // 0xBC OE, 0xBD oe, 0xBE Y-diaeresis; 0xBF is punctuation.
        0,0,0,0,1,2,0,0,2,0,2,0,1,2,1,0,  //     Žµ  ž º ŒœŸ
        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  // ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏ
        // 0xD7 is the multiplication sign; 0xDF (sharp s) is lower case.
        1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,2,  // ÐÑÒÓÔÕÖ ØÙÚÛÜÝÞß
        2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,  // àáâãäåæçèéêëìíîï
        2,2,2,2,2,2,2,0,2,2,2,2,2,2,2,2}; // ðñòóôõö øùúûüýþÿ
    char cases = 0;
    while (*s && cases != 3) {
        // Index through unsigned char so bytes >= 0x80 never become
        // negative indices where plain char is signed.
        cases |= alphabet[(unsigned char) *s++];
    }
    return cases; // 0 = none, 1 = upper, 2 = lower, 3 = mixed
}

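As suggested in a comment by chux, you can set the value of alphabet[0] to 4; the while loop then needs only the single condition cases < 3, because the terminating null character pushes cases to 4 or more. In that variant, return cases & 3 so that the result is still in the range 0 to 3.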

回答2:

This should be fairly efficient - it checks the minimum number of characters necessary. This assumes a bias towards lower-case characters, so checking for lower-case first should be slightly more efficient:

#include <ctype.h>

/*
 * Report whether a NUL-terminated string contains both a lower-case and an
 * upper-case character, as classified by islower() / isupper().
 *
 * Returns 1 as soon as both kinds have been seen, otherwise 0.
 * Characters that are neither lower nor upper case are ignored.
 */
int ismixed( const unsigned char *str )
{
    enum { SEEN_LOWER = 1, SEEN_UPPER = 2, SEEN_BOTH = SEEN_LOWER | SEEN_UPPER };
    unsigned seen = 0;

    for ( const unsigned char *p = str; *p != '\0'; ++p )
    {
        // lower case is tested first; a character is never counted as both
        if ( islower( *p ) )
        {
            seen |= SEEN_LOWER;
        }
        else if ( isupper( *p ) )
        {
            seen |= SEEN_UPPER;
        }

        // stop at the first position where both cases are known
        if ( seen == SEEN_BOTH )
        {
            return 1;
        }
    }

    return 0;
}

Depending on whether your input is biased to lower or upper case, checking isupper() first might be better.

回答3:

If we assume ASCII

If we assume all alpha,

Then the code only needs to count the "case" bits (the single bit that differs between 'A' and 'a'). Is the count 0, equal to the string length, or something in between?

/*
 * Print whether a string is all upper case, all lower case or mixed.
 *
 * Relies on the two assumptions stated with this snippet: the encoding is
 * ASCII and every character is a letter. Under those assumptions the bit
 * 'A' ^ 'a' is set exactly in the lower-case letters, so counting that bit
 * over the whole string yields the number of lower-case characters.
 */
void test_case(const char *s) {
  const size_t case_bit = 'A' ^ 'a';
  size_t bit_total = 0;
  size_t length = 0;

  // Accumulate the masked case bit of every character up to the terminator.
  for (; s[length] != '\0'; length++) {
    bit_total += s[length] & case_bit;
  }

  // Each lower-case letter contributed exactly case_bit to the total.
  const size_t lower_count = bit_total / case_bit;

  const char *verdict;
  if (length == 0) {
    verdict = "Empty string";
  } else if (lower_count == 0) {
    verdict = "All UC";
  } else if (lower_count == length) {
    verdict = "All LC";
  } else {
    verdict = "Mixed";
  }
  puts(verdict);
}

Note: with slight modifications, this will work for EBCDIC too.

回答4:

Is the string guaranteed to contain only letters? If so, you could check whether any two consecutive characters have different cases.

#include <ctype.h>
#include <errno.h>
/*
 * Report whether a string of letters mixes upper and lower case by comparing
 * each character with its predecessor.
 *
 * str: NUL-terminated string, assumed to contain only letters. Any
 *      character that is not lower case counts as "upper", so digits or
 *      punctuation next to a lower-case letter are reported as mixed.
 *
 * Returns 1 if two neighbouring characters differ in case, 0 if not, and
 * -1 with errno set to EINVAL when str is NULL.
 */
int mixed_case(const char *str) {
   if (!str) {
      // sanity check
      errno = EINVAL;
      return -1;
   }

   // can't be mixed-case without more than one letter
   if (str[0] == '\0' || str[1] == '\0') {
      return 0;
   }

   // Walk with a pointer rather than an int index so very long strings
   // cannot overflow the counter.
   for (const char *p = str + 1; *p != '\0'; ++p) {
      // <ctype.h> functions require a value representable as unsigned char;
      // passing a negative plain char is undefined behaviour.
      const int cur_lower = islower((unsigned char)p[0]) != 0;
      const int prev_lower = islower((unsigned char)p[-1]) != 0;
      if (cur_lower != prev_lower) {
         // two neighbouring characters of different case: mixed case
         return 1;
      }
   }
   // didn't find any mismatches, so not mixed case
   return 0;
}

Taking a similar approach, but instead of checking consecutive characters, it will find the first alphabetical character and check it against any other alphabetical characters found. This should be able to handle strings with non-alphabetical characters.

/*
 * Report whether the letters of a string mix upper and lower case,
 * ignoring every non-alphabetical character.
 *
 * str: NUL-terminated string to examine.
 *
 * Returns 1 if some letter differs in case from the first letter, 0 if not
 * (including strings with no letters or only one), and -1 with errno set to
 * EINVAL when str is NULL.
 */
int mixed_case(const char *str) {
   if (!str) {
      // sanity check
      errno = EINVAL;
      return -1;
   }

   // Find the first alphabetical character. The scan must stop at the
   // terminator, otherwise a string without letters is read out of bounds.
   // This also covers empty and one-character strings.
   const char *first = str;
   while (*first != '\0' && !isalpha((unsigned char)*first)) {
      ++first;
   }

   if (*first == '\0') {
      // no alphabetical characters means you can't have mixed cases
      return 0;
   }

   // <ctype.h> functions need unsigned char values; a negative plain char
   // is undefined behaviour.
   const int first_is_lower = islower((unsigned char)*first) != 0;

   // See if any later letter differs from the case of the first one
   for (const char *p = first + 1; *p != '\0'; ++p) {
      const unsigned char c = (unsigned char)*p;
      if (isalpha(c) && (islower(c) != 0) != first_is_lower) {
         return 1;
      }
   }
   // didn't find any mismatches, so not mixed case
   return 0;
}

回答5:

Another approach that does not assume ASCII nor all alpha.

Assess the first char and then perform one of 2 optimized loops.

This quits the loops on the first mis-match. Since the while() loops are only doing a single test, this leads to optimal performance.

#include <ctype.h>

/*
 * Print a case classification for a string, decided by its first character.
 *
 * If the first character is lower (or upper) case, the run of characters of
 * that same case is skipped; reaching the terminator means the whole string
 * has that case, anything else is reported as mixed or non-alphabetical.
 * An empty string and a non-letter first character get their own messages.
 */
void case_test(const char *s) {
  // The is***() functions need values in unsigned char range.
  const unsigned char *p = (const unsigned char *)s;

  if (*p == '\0') {
    puts("Empty string");
    return;
  }

  if (islower(*p)) {
    // The first character is already known to be lower case; skip the run.
    do {
      p++;
    } while (islower(*p));
    puts(*p != '\0' ? "Mixed or not alpha" : "All lower");
    return;
  }

  if (isupper(*p)) {
    // Same single-test loop for an upper-case run.
    do {
      p++;
    } while (isupper(*p));
    puts(*p != '\0' ? "Mixed case or not alpha" : "All upper");
    return;
  }

  puts("Not alpha");
}

OP added cases including non-alpha. The below promptly handles that.

/*
 * Print a case classification for a string that may contain non-letters.
 *
 * The first alphabetical character decides which case is expected; the rest
 * of the string is then searched for a letter of the opposite case.
 * Non-alphabetical characters are skipped throughout.
 *
 * Note: "Empty string" is also printed for a non-empty string that contains
 * no alphabetical character.
 */
void case_test_with_non_letters(const char *s) {
  // The is***() functions need values in unsigned char range.
  const unsigned char *p = (const unsigned char *)s;

  // Advance to the first alphabetical character or the terminator.
  while (*p != '\0' && !isalpha(*p)) {
    p++;
  }

  if (*p == '\0') {
    puts("Empty string");
    return;
  }

  if (islower(*p)) {
    // Look for any upper-case character in the remainder.
    while (*p != '\0' && !isupper(*p)) {
      p++;
    }
    puts(isupper(*p) ? "Mixed" : "All letters lower");
  } else if (isupper(*p)) {
    // Look for any lower-case character in the remainder.
    while (*p != '\0' && !islower(*p)) {
      p++;
    }
    puts(*p != '\0' ? "Mixed case" : "All letters upper");
  } else {
    // Alphabetical according to isalpha() but neither lower nor upper case.
    puts("Not alpha");
  }
}

回答6:

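97 = a = 1100001, 65 = A = 1000001

You just have to test bit 5 (counting from zero; the bit with value 0x20, i.e. the sixth bit from the right): it is set for lowercase ASCII letters and clear for uppercase ones.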

来源：https://stackoverflow.com/questions/57682576/most-efficient-way-to-find-if-a-string-is-mixedcase

标签

string

algorithm

optimization