I want to summarize rather than compress in a similar manner to run length encoding but in a nested sense.
For instance, I want : ABCBCABCBCDEEF to become: (2A(2BC
I'm pretty sure this isn't the best approach, and depending on the length of the patterns, might have a running time and memory usage that won't work, but here's some code.
You can paste the following code into LINQPad and run it, and it should produce the following output:
ABCBCABCBCDEEF = (2A(2BC))D(2E)F ABBABBABBABA = (3A(2B))ABA ABCDABCDCDCDCD = (2ABCD)(3CD)
As you can see, the middle example encoded ABB as A(2B) instead of ABB, you would have to make that judgment yourself, if single-symbol sequences like that should be encoded as a repeated symbol or not, or if a specific threshold (like 3 or more) should be used.
Basically, the code runs like this:
Anyway, here's the code:
void Main()
{
string[] examples = new[]
{
"ABCBCABCBCDEEF",
"ABBABBABBABA",
"ABCDABCDCDCDCD",
};
foreach (string example in examples)
{
StringBuilder sb = new StringBuilder();
foreach (var r in Encode(example))
sb.Append(r.ToString());
Debug.WriteLine(example + " = " + sb.ToString());
}
}
public static IEnumerable> Encode(IEnumerable values)
{
return Encode(values, EqualityComparer.Default);
}
public static IEnumerable> Encode(IEnumerable values, IEqualityComparer comparer)
{
List sequence = new List(values);
int index = 0;
while (index < sequence.Count)
{
var bestSequence = FindBestSequence(sequence, index, comparer);
if (bestSequence == null || bestSequence.Length < 1)
throw new InvalidOperationException("Unable to find sequence at position " + index);
yield return bestSequence;
index += bestSequence.Length;
}
}
private static Repeat FindBestSequence(IList sequence, int startIndex, IEqualityComparer comparer)
{
int sequenceLength = 1;
while (startIndex + sequenceLength * 2 <= sequence.Count)
{
if (comparer.Equals(sequence[startIndex], sequence[startIndex + sequenceLength]))
{
bool atLeast2Repeats = true;
for (int index = 0; index < sequenceLength; index++)
{
if (!comparer.Equals(sequence[startIndex + index], sequence[startIndex + sequenceLength + index]))
{
atLeast2Repeats = false;
break;
}
}
if (atLeast2Repeats)
{
int count = 2;
while (startIndex + sequenceLength * (count + 1) <= sequence.Count)
{
bool anotherRepeat = true;
for (int index = 0; index < sequenceLength; index++)
{
if (!comparer.Equals(sequence[startIndex + index], sequence[startIndex + sequenceLength * count + index]))
{
anotherRepeat = false;
break;
}
}
if (anotherRepeat)
count++;
else
break;
}
List oneSequence = Enumerable.Range(0, sequenceLength).Select(i => sequence[startIndex + i]).ToList();
var repeatedSequence = Encode(oneSequence, comparer).ToArray();
return new SequenceRepeat(count, repeatedSequence);
}
}
sequenceLength++;
}
// fall back, we could not find anything that repeated at all
return new SingleSymbol(sequence[startIndex]);
}
public abstract class Repeat
{
public int Count { get; private set; }
protected Repeat(int count)
{
Count = count;
}
public abstract int Length
{
get;
}
}
public class SingleSymbol : Repeat
{
public T Value { get; private set; }
public SingleSymbol(T value)
: base(1)
{
Value = value;
}
public override string ToString()
{
return string.Format("{0}", Value);
}
public override int Length
{
get
{
return Count;
}
}
}
public class SequenceRepeat : Repeat
{
public Repeat[] Values { get; private set; }
public SequenceRepeat(int count, Repeat[] values)
: base(count)
{
Values = values;
}
public override string ToString()
{
return string.Format("({0}{1})", Count, string.Join("", Values.Select(v => v.ToString())));
}
public override int Length
{
get
{
int oneLength = 0;
foreach (var value in Values)
oneLength += value.Length;
return Count * oneLength;
}
}
}
public class GroupRepeat : Repeat
{
public Repeat Group { get; private set; }
public GroupRepeat(int count, Repeat group)
: base(count)
{
Group = group;
}
public override string ToString()
{
return string.Format("({0}{1})", Count, Group);
}
public override int Length
{
get
{
return Count * Group.Length;
}
}
}