How to calculate frequency of elements for pairwise comparisons of lists in Python?

前端 未结 1 716
甜味超标
甜味超标 2020-12-12 02:33

I have the sample stored in the following list:

 sample = ['AAAA', 'CGCG', 'TTTT', 'AT-T', 'CATC']

.. To illustrate the problem, I have denoted them a

相关标签:
1条回答
  • I think this is what you want:

    from collections import Counter

    # Pairwise comparison of the aligned sequences in `sample` (a list of
    # strings; '-' marks a gap).  For every pair of sequences that has exactly
    # two distinct gap-free columns, compute float_a and float_b.

    # Remove elements where all nucleobases are the same (this also drops
    # empty strings).  Walk the indices backwards so deleting an element
    # never shifts the position of one that has not been checked yet.
    for index in range(len(sample) - 1, -1, -1):
        if sample[index][:1] * len(sample[index]) == sample[index]:
            del sample[index]

    for indexA, setA in enumerate(sample):
        # Pair each sequence only with the ones before it: no sequence is
        # compared with itself and no pair is visited twice.
        for indexB, setB in enumerate(sample[:indexA]):
            # Count each aligned (baseA, baseB) column, ignoring any column
            # in which either sequence has a gap.
            pair_count = Counter()
            for pair in zip(setA, setB):
                if '-' not in pair:
                    pair_count[pair] += 1

            # Only analyse pairs of sequences with exactly 2 distinct columns.
            if len(pair_count) != 2:
                continue

            # Count individual bases across both sequences of the pair.
            base_counter = Counter()
            for pair, count in pair_count.items():
                base_counter[pair[0]] += count
                base_counter[pair[1]] += count

            # Number of gap-free aligned columns.
            sequence_length = sum(pair_count.values())

            # Divide each base count by the column count.  Both sequences are
            # tallied, so these values sum to 2 rather than 1.
            base_freq = {}
            for base, count in base_counter.items():
                base_freq[base] = count / float(sequence_length)

            # Examine one of the two distinct columns to calculate float_a.
            pair = next(iter(pair_count))
            float_a = (pair_count[pair] / float(sequence_length)) - base_freq[pair[0]] * base_freq[pair[1]]

            # Step 7: divide by the product of the four base frequencies.
            denominator = (base_freq.get('A', 0) * base_freq.get('T', 0)
                           * base_freq.get('C', 0) * base_freq.get('G', 0))
            if denominator == 0:
                # At least one of A/T/C/G is absent from this pair, so the
                # quotient is undefined; skip the pair instead of raising
                # ZeroDivisionError.
                # NOTE(review): confirm that skipping is the desired handling.
                continue
            float_b = float_a / float(denominator)
    

    Or, more Pythonically (with the list/dict comprehensions you don't want):

    from collections import Counter

    BASES = 'ATCG'

    # Pairwise comparison of the aligned sequences in `sample` (a list of
    # strings; '-' marks a gap).  For every pair of sequences that has exactly
    # two distinct gap-free columns, compute float_a and float_b.

    # Remove elements where all nucleobases are the same (this also drops
    # empty strings).
    sample = [item for item in sample if item[:1] * len(item) != item]

    for indexA, setA in enumerate(sample):
        # Pair each sequence only with the ones before it: no sequence is
        # compared with itself and no pair is visited twice.
        for indexB, setB in enumerate(sample[:indexA]):
            # Aligned (baseA, baseB) columns, dropping any column with a gap.
            relevant_pairs = [(elA, elB) for (elA, elB) in zip(setA, setB) if elA != '-' and elB != '-']
            pair_count = Counter(relevant_pairs)

            # Only analyse pairs of sequences with exactly 2 distinct columns.
            if len(pair_count) != 2:
                continue

            # Gap-free bases of each sequence.  These get their own names:
            # rebinding setA here would replace the outer loop's sequence with
            # a shortened tuple and misalign every later comparison for the
            # same indexA.
            basesA, basesB = zip(*relevant_pairs)

            # Number of gap-free aligned columns.
            seq_length = len(relevant_pairs)

            # Divide each base count by the column count.  Both sequences are
            # tallied, so these values sum to 2 rather than 1.
            base_freq = {base: count / float(seq_length) for (base, count) in (Counter(basesA) + Counter(basesB)).items()}

            # Examine one of the two distinct columns to calculate float_a.
            pair = next(iter(pair_count))
            float_a = (pair_count[pair] / float(seq_length)) - base_freq[pair[0]] * base_freq[pair[1]]

            # Step 7: divide by the product of the four base frequencies.
            denominator = 1
            for base in BASES:
                denominator *= base_freq.get(base, 0)

            if denominator == 0:
                # At least one of the BASES is absent from this pair, so the
                # quotient is undefined; skip the pair instead of raising
                # ZeroDivisionError.
                # NOTE(review): confirm that skipping is the desired handling.
                continue

            float_b = float_a / denominator
    
    0 讨论(0)
提交回复
热议问题