Similarity can be computed easily without classification. Try this: it is O(n^2), but it works fine.
def jaccard_similarity(doc1, doc2):
    """Return the Jaccard similarity of two documents' word sets.

    Each document is split on whitespace and reduced to its set of distinct
    tokens; the score is ``|A & B| / |A | B|``.

    Args:
        doc1: First document, as a string.
        doc2: Second document, as a string.

    Returns:
        A float in [0, 1]. 1.0 means both documents contain exactly the same
        set of words; 0.0 means they share none. Two documents with no tokens
        at all (empty or whitespace-only) are treated as identical and score
        1.0.
    """
    words1 = set(doc1.split())
    words2 = set(doc2.split())
    combined = words1 | words2
    if not combined:
        # Both documents are empty: avoid dividing by zero and report them
        # as exact replicas of each other.
        return 1.0
    # float() on the numerator keeps true division on Python 2 as well.
    return float(len(words1 & words2)) / len(combined)