count occurrences of a string pattern in a file and count

问题

Team,

I am trying to count the occurrences of two patterns in a file and list them as follows:

pattern1: 2
pattern2: 3

#!/usr/bin/python
import os
import re

d = dict()  # counter dictionary: key -> number of occurrences (starts empty)
with open('/home/user/waste/nodes-prod.log', 'r') as file:
    for line in file:
        line = line.strip()
        for word in line.split():
            node1 = re.match(r"team1.*", word)  # a Match object, or None if the word does not start with "team1"
            type(node1)  # no effect: the result is discarded
            node2 = re.match(r"team2.*", word)  # likewise a Match object or None, never a string
            type(node2)  # no effect: the result is discarded
            if node1 in d:  # nothing has been stored in d before this test, so it is False
                d[node1] = d[node1] + 1
            else:
                d[node2] = d[node2] + 1  # reads d[node2] before it was ever assigned -> KeyError
for key in list(d.keys()):
    print(key, ":", d[key])

my /home/user/waste/nodes-prod.log is below

cat /home/user/waste/nodes-prod.log
team1-develop
team1-work
team2-research1
team2-research2
team2-research3

output

Traceback (most recent call last):
  File "read-and-count-words-pattern-fromfile-using-dict-in-python.py", line 17, in <module>
    d[node2] = d[node2] + 1
KeyError: <_sre.SRE_Match object; span=(0, 10), match='team2-research1'>

expected:

node1: 2
node2: 3

回答1:

It is easier if you read the entire text into memory (if that is not burdensome given the size of the file):

import re 

# Load the whole file into a single string.
with open(fn) as f:
    txt = f.read()

# "." does not match a newline by default, so each match stays on one line
# and the number of matches is the number of occurrences.
team1_hits = re.findall(r"team1.*", txt)
team2_hits = re.findall(r"team2.*", txt)
print(f'node 1: {len(team1_hits)}')
print(f'node 2: {len(team2_hits)}')

Prints:

node 1: 2
node 2: 3

If you do want to do line-by-line, you can just keep a counter:

import re 

# One running counter per pattern.
node1 = 0
node2 = 0
with open(fn) as f:
    for line in f:
        # re.search scans the whole line, so the pattern may occur anywhere in it.
        if re.search(r"team1.*", line):
            node1 += 1
        if re.search(r"team2.*", line):
            node2 += 1

print(f'node 1: {node1}')
print(f'node 2: {node2}')

Better still, you could use a dict that maps the number captured from any `team\d+` match to its count:

# Maps the digits captured after "team" (as a string) to the number of
# lines on which that team was found.
nodes = {}
with open(fn) as f:
    for line in f:
        m = re.search(r"team(\d+).*", line)
        if not m:
            continue
        team = m.group(1)
        nodes[team] = nodes.get(team, 0) + 1

>>> nodes
{'1': 2, '2': 3}

回答2:

#!/usr/bin/python
import os
import re

def increment(dict: dict, pattern: str, word: str):
    """Count one occurrence of ``word`` if it matches ``pattern``.

    ``dict`` is the counter dictionary and is updated in place, ``pattern``
    is the regular expression, and ``word`` is the text to test.
    ``re.match`` anchors the pattern at the start of ``word``. The counter
    key is the matched text itself (the whole match), not the pattern.
    Nothing is changed when the word does not match.
    """
    found = re.match(pattern, word)
    if found is None:
        return
    # re.match returns a Match object, not a string; group() without an
    # argument is group(0), i.e. the entire matched text.
    matched_text = found.group()
    # Start from 0 the first time this text is seen, then add one.
    dict[matched_text] = dict.get(matched_text, 0) + 1

def scores(filename: str, patterns: dict) -> dict:
    """Count pattern matches over every whitespace-separated word of a file.

    ``filename`` is the path of the file to parse. ``patterns`` is only
    iterated, yielding one regular expression per item; the example below
    passes a list even though the annotation says ``dict``. Returns the
    counter dictionary filled in by ``increment``.
    """
    counts = {}
    with open(filename, 'r') as handle:
        for row in handle:
            # str.split() without arguments already ignores leading and
            # trailing whitespace, including the newline.
            for token in row.split():
                # Check the word against all patterns:
                for regex in patterns:
                    increment(counts, regex, token)
    return counts

# Patterns to search for. Each one is tried against every
# whitespace-separated word of the file with re.match, i.e. anchored
# at the start of the word. The re module keeps a cache of compiled
# patterns, so passing the pattern strings directly is fine here and
# pre-compiling them is optional:
patterns = [r"team1.*", r"team2.*"]

# Path of the file to parse:
filename = '/home/user/waste/nodes-prod.log'

# dict.items() yields (key, value) pairs; use it when both the key and the value are needed:
for key, value in scores(filename, patterns).items():
    print(key, ":", value)

def increment(dict: dict, pattern: str, word: str): defines a function that receives a dictionary dict, a pattern, and the word to check against the pattern. Inside the function, the result of re.match is stored in match as a Match object. The parameters are annotated with types, which is optional in Python.
def scores(filename: str, patterns: dict) -> dict: defines a function that receives filename as a string and a collection of patterns (annotated as dict, although the example passes a list), and returns a dictionary of match counts.

来源：https://stackoverflow.com/questions/64219282/count-occurrences-of-a-string-pattern-in-a-file-and-count

标签

python-3.x

regex

regex-group