Split large text string into variable length strings without breaking words and keeping linebreaks and spaces

折月煮酒 提交于 2019-11-29 15:50:00
/// <summary>
/// Splits <paramref name="text"/> into consecutive lines of variable maximum length
/// without breaking words. Whitespace characters (spaces, tabs, line feeds) are kept,
/// so concatenating the returned lines yields the tokens produced by <c>Split</c> in order.
/// </summary>
/// <param name="text">The text to split.</param>
/// <param name="lengths">
/// Maximum number of characters for each line (inclusive): <c>lengths[0]</c> applies to the
/// first line, <c>lengths[1]</c> to the second, and so on. The last entry is reused for
/// every remaining line.
/// </param>
/// <returns>The lines, in order. Empty when <paramref name="text"/> is empty.</returns>
/// <exception cref="ArgumentNullException">
/// <paramref name="text"/> or <paramref name="lengths"/> is null.
/// </exception>
/// <exception cref="ArgumentException"><paramref name="lengths"/> is empty.</exception>
public List<String> SplitString(String text, int[] lengths)
{
    if (text == null)
    {
        throw new ArgumentNullException("text");
    }

    if (lengths == null)
    {
        throw new ArgumentNullException("lengths");
    }

    if (lengths.Length == 0)
    {
        throw new ArgumentException("At least one line length is required.", "lengths");
    }

    List<String> output = new List<String>();
    List<String> words = Split(text);
    StringBuilder line = new StringBuilder();

    int i = 0;
    int lineNum = 0;
    while (i < words.Count)
    {
        String word = words[i];
        int limit = lengths[lineNum];

        if (line.Length > 0 && line.Length + word.Length > limit)
        {
            // The current line is full: emit it and retry the same token on the next line.
            output.Add(line.ToString());
            line.Clear();

            // Move to the next limit once per emitted line; the last limit is reused.
            if (lineNum < lengths.Length - 1)
            {
                lineNum++;
            }

            continue;
        }

        // Either the token fits, or the line is empty and the token alone exceeds the
        // limit. In the latter case it is placed on its own (over-long) line, because a
        // word must not be broken and refusing it would never make progress.
        line.Append(word);
        i++;
    }

    // Emit whatever is left in the buffer as the final line.
    if (line.Length > 0)
    {
        output.Add(line.ToString());
    }

    return output;
}


   /// <summary>
   /// Tokenizes <paramref name="text"/> into words and individual whitespace characters.
   /// Each space, tab and line feed becomes its own one-character token; every run of
   /// other characters becomes one word token. Concatenating the tokens reproduces the input.
   /// </summary>
   /// <param name="text">The text to tokenize.</param>
   /// <returns>The tokens in their original order. Empty when the text is empty.</returns>
   /// <exception cref="ArgumentNullException"><paramref name="text"/> is null.</exception>
   public static List<string> Split(string text)
   {
       if (text == null)
       {
           throw new ArgumentNullException("text");
       }

       List<string> result = new List<string>();
       StringBuilder sb = new StringBuilder();

       foreach (var letter in text)
       {
           if (letter != ' ' && letter != '\t' && letter != '\n')
           {
               sb.Append(letter);
               continue;
           }

           // A delimiter ends the word collected so far (if any).
           if (sb.Length > 0)
           {
               result.Add(sb.ToString());
               sb.Clear();
           }

           // The delimiter itself is kept as a token so whitespace is preserved.
           result.Add(letter.ToString());
       }

       // Flush the trailing word: text that does not end in whitespace would otherwise
       // lose its last word.
       if (sb.Length > 0)
       {
           result.Add(sb.ToString());
       }

       return result;
   }

This code is untested and has not been compiled, but you should get the idea.

I also think you should use a StringBuilder instead, but I didn't remember how to use it.

Tim Pietzcker
\A(.{0,5}\b)(.{0,11}\b)(.{0,20}\b)+\Z

will capture up to five characters in group 1, up to 11 in group 2, and chunks of up to 20 in group 3. Matches are split at word boundaries to avoid breaking in the middle of a word. Whitespace, line breaks, etc. count as characters and are preserved.

The trick is to get at the individual matches in the repeated group, something that can only be done in .NET and Perl 6:

// Group 1 takes up to 5 characters, group 2 up to 11, and group 3 repeats in chunks of
// up to 20; every chunk must end on a word boundary (\b). Singleline lets '.' match
// line feeds, so line breaks count as ordinary characters inside a chunk.
Regex paragraphs = new Regex(@"\A(.{0,5}\b)(.{0,11}\b)(.{0,20}\b)+\Z", RegexOptions.Singleline);
Match matchResults = paragraphs.Match(subjectString);
if (matchResults.Success) {
    String line1 = matchResults.Groups[1].Value;
    String line2 = matchResults.Groups[2].Value;
    // Group.Captures is a CaptureCollection holding one Capture per repetition of
    // group 3, i.e. one entry for each line from the third line onward.
    CaptureCollection line3andup = matchResults.Groups[3].Captures;
    foreach (Capture capture in line3andup) {
        String line = capture.Value;
        // use 'line' here (line 3, line 4, ...)
    }
} else {
    // Match attempt failed, e.g. a word too long for its chunk leaves no word
    // boundary to break on.
}

I don't know C# at all and have tried to construct this from RegexBuddy's templates and the VB code here, so please feel free to point out my coding errors.

Note that the whitespace at the beginning of line two is captured at the end of the previous match.

标签
易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!