How to make fast the import of an excel file containing more than 5000 lines into sqlite database with django

倾然丶 夕夏残阳落幕 提交于 2020-06-29 04:38:07

问题


Importing an xls file (more than 5000 lines) into my sqlite database takes far too long.

# Django view that imports an uploaded Excel workbook one row at a time.
# NOTE(review): as pasted, the body of the GET branch is missing and the
# if/else indentation is broken; the code below is the non-GET branch.
def importeradsl(request):
if "GET" == request.method:
    else:
        excel_file = request.FILES["excel_file"]
        # You may put validations here to check the extension or file size.
        wb = openpyxl.load_workbook(excel_file)
        # Get a particular sheet by name out of the workbook's sheets.
        worksheet = wb["Sheet 1"]
        # Iterate over the rows (starting at row 2) and read each cell's value.
        for row in worksheet.iter_rows(min_row=2):
            row_data = list()
            for cell in row:
                # Every cell value is converted to str before being used.
                row_data.append(str(cell.value))
            # Pick the columns needed to build the model instances.
            # Client
            nd = row_data[0]
            nom_client = row_data[3]
            nd_contact = row_data[4]
            # Category
            code_categorie = row_data[6]
            acces_reseau = row_data[8]
            etat = row_data[9]
            origine = row_data[10]
            code_sig = row_data[11]
            agent_sig = row_data[13]
            # Dates are normalised to 'YYYY-MM-DD HH:MM:SS' strings.
            date_sig = dt.datetime.strftime(parse(row_data[14]), '%Y-%m-%d %H:%M:%S')
            date_essai = dt.datetime.strftime(parse(row_data[15]), '%Y-%m-%d %H:%M:%S')
            agent_essai = row_data[18]
            # NOTE(review): when strptime raises, the except branch only prints,
            # so date_ori is not assigned for this row (stale value from the
            # previous row, or NameError on the first one). On success the
            # else branch overwrites the datetime with a formatted string.
            try:
                date_ori = dt.datetime.strptime(row_data[19], '%Y-%m-%d %H:%M:%S')
            except ValueError as e:
                print ("Vous", e)
            else:
                date_ori = dt.datetime.strftime(parse(row_data[19]), '%Y-%m-%d %H:%M:%S')
            agent_ori = row_data[20]
            code_ui = row_data[21]
            equipe = row_data[22]
            sous_traitant = row_data[23]
            date_pla = dt.datetime.strftime(parse(row_data[24]), '%Y-%m-%d %H:%M:%S')
            date_rel = dt.datetime.strftime(parse(row_data[25]), '%Y-%m-%d %H:%M:%S')
            # The same two columns are parsed again as datetime objects so the
            # delay between them can be computed.
            date_releve = dt.datetime.strptime(row_data[25], '%Y-%m-%d %H:%M:%S')
            date_essais = dt.datetime.strptime(row_data[15], '%Y-%m-%d %H:%M:%S')
            # Interpret both naive datetimes as Africa/Dakar local time, then
            # convert them to UTC.
            pst = pytz.timezone('Africa/Dakar')
            date_releve = pst.localize(date_releve)
            utc = pytz.UTC
            date_releve = date_releve.astimezone(utc)
            date_essais = pst.localize(date_essais)
            date_essais = date_essais.astimezone(utc)
            code_rel = row_data[26]
            localisation = row_data[27]
            cause = row_data[28]
            commentaire = row_data[29]
            agent_releve = row_data[30]
            centre_racc = row_data[32]
            rep = row_data[33]
            srp = row_data[34]
            # delai is the elapsed time in seconds; dali is the number of
            # whole days (86400 seconds each) in that delay.
            delai = (date_releve - date_essais).total_seconds()
            dali = divmod(delai, 86400)[0]
            # ISO week number, month and year of the (UTC) date_releve.
            semaine = date_releve.isocalendar()[1]
            mois = date_releve.month
            annee = date_releve.year
            # More than 7 whole days is flagged "PEX PLUS", otherwise "PEX".
            if dali > 7:
                etats = "PEX PLUS"
            else:
                etats = "PEX"
            # Save a client
            Client(nd=nd, nom=nom_client, mobile=nd_contact).save()
            # Save the category
            # The category name is hard-coded for now - to be filled in later
            Categorie(code_categorie=code_categorie, nom="Public").save()
            # Save the signalling agent
            AgentSig(matricule=agent_sig, nom="Awa").save()
            # Save the test agent
            AgentEssai(matricule=agent_essai).save()
            # Save the orientation agent
            AgentOri(matricule=agent_ori).save()
            # Save the clearance (releve) agent
            AgentRel(matricule=agent_releve).save()
            # Save the subcontractor
            SousTraitant(nom=sous_traitant).save()
            # Save the centre
            Centre(code=centre_racc).save()
            # Save the intervention unit (fetches the subcontractor saved above)
            UniteIntervention(code_ui=code_ui, 
            sous_traitant=SousTraitant.objects.get(nom=sous_traitant)).save()
            # Save the repartiteur (fetches the centre saved above)
            Repartiteur(code=rep, crac=Centre.objects.get(code=centre_racc)).save()
            # Save the team
            Equipe(nom=equipe, unite=UniteIntervention.objects.get(code_ui=code_ui)).save()
            # Save the sous-repartiteur
            SousRepartiteur(code=srp, rep=Repartiteur.objects.get(code=rep)).save()
            # Save the fault record; every related object is fetched again
            # with its own objects.get() call.
            # NOTE(review): categorie is given a new, unsaved Categorie
            # instance rather than the one saved above.
            DerangementAdsl(acces_reseau=acces_reseau,
                            nd_client=Client.objects.get(nd=nd),
                            categorie=Categorie(code_categorie=code_categorie),
                            etat=etat,
                            origine=origine,
                            code_sig=code_sig,
                            agent_sig=AgentSig.objects.get(matricule=agent_sig),
                            date_sig=date_sig,
                            date_essai=date_essai,
                            agent_essai=AgentEssai.objects.get(matricule=agent_essai),
                            date_ori=date_ori,
                            agent_ori=AgentOri.objects.get(matricule=agent_ori),
                            sous_traitant=SousTraitant.objects.get(nom=sous_traitant),
                            unite_int = UniteIntervention.objects.get(code_ui=code_ui),
                            date_pla=date_pla,
                            date_rel=date_rel,
                            code_rel=code_rel,
                            code_local=localisation,
                            cause=cause,
                            comment_cause=commentaire,
                            agent_rel=AgentRel.objects.get(matricule=agent_releve),
                            centre=Centre.objects.get(code=centre_racc),
                            rep=Repartiteur.objects.get(code=rep),
                            srep=SousRepartiteur.objects.get(code=srp),
                            delai=dali,
                            etat_vr=etats,
                            semaine=semaine,
                            mois=mois,
                            annee=annee).save()

回答1:


There are a few things that are incorrect. I propose the following approach:

  1. Make your code more readable
  2. Remove useless queries
  3. Avoid related records duplication
  4. Cache out your related instances.
  5. Use bulk_create

Looking at your code, a rough estimate is over 30 SQL queries per spreadsheet row, which is a lot...

1. Make your code more readable.

Your parsing logic can be DRYed, a lot.

First, identify what you do with your data. From my point of view, 2 main functions:

Do nothing:

def no_transformation(value):
    """Return ``value`` converted to ``str``, with no other processing."""
    return str(value)

Parse dates

def strptime(value):
    """Convert a cell value into a ``datetime``.

    Args:
        value: Either a ``datetime`` (returned unchanged) or any object whose
            ``str()`` form looks like ``'YYYY-MM-DD HH:MM:SS'``.

    Returns:
        datetime.datetime: The parsed (naive) date and time.

    Raises:
        ValueError: If the text does not match ``'%Y-%m-%d %H:%M:%S'``.
    """
    # datetime.strptime() only accepts text, so a value that is already a
    # datetime is handed back as-is instead of being re-parsed.
    if isinstance(value, dt.datetime):
        return value
    # No call to an external ``parse`` helper here: this answer defines its
    # own ``parse(row)`` function, which would shadow it.
    return dt.datetime.strptime(str(value), '%Y-%m-%d %H:%M:%S')

Now, you can declare your parser configuration:

# Declarative description of the spreadsheet columns that are read.
# Each entry is (column_index, variable_name, transformation_function).
# Entries are listed in column order. Columns 15 and 25 appear twice on
# purpose: each is exposed under two different names.
PARSER_CONFIG = (
    (0, 'nd', no_transformation),
    (3, 'nom_client', no_transformation),
    (4, 'nd_contact', no_transformation),
    (6, 'code_categorie', no_transformation),
    (8, 'acces_reseau', no_transformation),
    (9, 'etat', no_transformation),
    (10, 'origine', no_transformation),
    (11, 'code_sig', no_transformation),
    (13, 'agent_sig', no_transformation),
    (14, 'date_sig', strptime),
    (15, 'date_essai', strptime),
    (15, 'date_essais', strptime),
    (18, 'agent_essai', no_transformation),
    (19, 'date_ori', strptime),
    (20, 'agent_ori', no_transformation),
    (21, 'code_ui', no_transformation),
    (22, 'equipe', no_transformation),
    (23, 'sous_traitant', no_transformation),
    (24, 'date_pla', strptime),
    (25, 'date_rel', strptime),
    (25, 'date_releve', strptime),
    (26, 'code_rel', no_transformation),
    (27, 'localisation', no_transformation),
    (28, 'cause', no_transformation),
    (29, 'commentaire', no_transformation),
    (30, 'agent_releve', no_transformation),
    (32, 'centre_racc', no_transformation),
    (33, 'rep', no_transformation),
    (34, 'srp', no_transformation),
)

Now you know how to parse your data and how to name it. Let's just put that stuff into a dict.

def parse(row):
    """Transform a row into a dict.

    Each ``(index, key, transform)`` entry of ``PARSER_CONFIG`` reads
    ``row[index]``, passes it through ``transform`` and stores the result
    under ``key``.

    Args:
        row (tuple): Your row's data, indexable by column position.

    Returns:
        dict: Your parsed data, keyed by the names declared in PARSER_CONFIG.
    """
    return {
        key: transform(row[index]) for index, key, transform in PARSER_CONFIG
    }

From here, your parser is way more readable, you know exactly what you're doing with your data.

Wrapping this up all together, you should get:

PARSER_CONFIG=(
    # Each entry: (column_index, variable_name, transformation_function)
    # ... entries elided in this recap ...
)
def no_transformation(value):
    """Return ``value`` converted to ``str``, with no other processing."""
    return str(value)

def strptime(value):
    """Convert a cell value into a ``datetime``.

    Args:
        value: Either a ``datetime`` (returned unchanged) or any object whose
            ``str()`` form looks like ``'YYYY-MM-DD HH:MM:SS'``.

    Returns:
        datetime.datetime: The parsed (naive) date and time.

    Raises:
        ValueError: If the text does not match ``'%Y-%m-%d %H:%M:%S'``.
    """
    # datetime.strptime() only accepts text, so a value that is already a
    # datetime is handed back as-is instead of being re-parsed.
    if isinstance(value, dt.datetime):
        return value
    return dt.datetime.strptime(str(value), '%Y-%m-%d %H:%M:%S')

def parse(row):
    """Transform a row into a dict.

    Each ``(index, key, transform)`` entry of ``PARSER_CONFIG`` reads
    ``row[index]``, passes it through ``transform`` and stores the result
    under ``key``.

    Args:
        row (tuple): Your row's data, indexable by column position.

    Returns:
        dict: Your parsed data, keyed by the names declared in PARSER_CONFIG.
    """
    return {
        key: transform(row[index]) for index, key, transform in PARSER_CONFIG
    }

for row in rows:
    item = parse(row) # parsed column values only; related instances are not attached yet

Still have some work to create your related instances, but we'll get there eventually.

2. Removing useless queries.

You do :

# ... First, you create a record
Client(nd=nd, nom=nom_client, mobile=nd_contact).save()
# ... Then you fetch it again when saving DerangementAdsl
nd_client=Client.objects.get(nd=nd)

While a more pythonic way of doing this would be:

# ... You create and assign your instance in one step.
# Model.save() returns None, so chaining .save() would leave ``client`` set
# to None; objects.create() saves the row and returns the instance.
client = Client.objects.create(nd=item.get('nd'),
                               nom=item.get('nom_client'),
                               mobile=item.get('nd_contact'))
# ... later, pass the instance itself instead of fetching it again:
nd_client=client

You just saved one SQL query per row! Apply the same logic to each model and you'll save around 20 queries per row!

# Each statement saves one related record and keeps the returned instance.
# There is deliberately no trailing comma after a call: a trailing comma
# would bind a 1-tuple instead of the instance.
categorie = Categorie.objects.create(code_categorie=item.get('code_categorie'), nom="Public")
# Save the signalling agent
agent_sig = AgentSig.objects.create(matricule=item.get('agent_sig'), nom="Awa")
# Save the test agent
agent_essai = AgentEssai.objects.create(matricule=item.get('agent_essai'))
# Save the orientation agent
agent_ori = AgentOri.objects.create(matricule=item.get('agent_ori'))
# Save the clearance (releve) agent
agent_rel = AgentRel.objects.create(matricule=item.get('agent_releve'))
# Save the subcontractor
sous_traitant = SousTraitant.objects.create(nom=item.get('sous_traitant'))
# Save the centre
centre = Centre.objects.create(code=item.get('centre_racc'))
# Save the intervention unit; reusing sous_traitant saves one extra query
unite_int = UniteIntervention.objects.create(code_ui=item.get('code_ui'), sous_traitant=sous_traitant)
# Save the repartiteur; reusing centre saves one extra query
rep = Repartiteur.objects.create(code=item.get('rep'), crac=centre)
# Save the team; reusing unite_int saves one extra query
equipe = Equipe.objects.create(nom=item.get('equipe'), unite=unite_int)
# Save the sous-repartiteur; reusing rep saves one extra query
srep = SousRepartiteur.objects.create(code=item.get('srp'), rep=rep)

3. Avoid related records duplication

Now there is one big issue:

Considering you have multiple rows for each client, you'll eventually end up with many duplicates, and you do not want that. Instead of using create, you should use get_or_create.

Please note it returns a tuple: (instance, created) So.... your code should go like:

# get_or_create() returns an (instance, created) pair, unpacked on each line.
# There is no trailing comma after a call: it would wrap the pair in a
# 1-tuple and make the two-name unpacking fail.
categorie, categorie_created = Categorie.objects.get_or_create(code_categorie=item.get('code_categorie'), nom="Public")
agent_sig, agent_sig_created = AgentSig.objects.get_or_create(matricule=item.get('agent_sig'), nom="Awa")
agent_essai, agent_essai_created = AgentEssai.objects.get_or_create(matricule=item.get('agent_essai'))
agent_ori, agent_ori_created = AgentOri.objects.get_or_create(matricule=item.get('agent_ori'))
agent_rel, agent_rel_created = AgentRel.objects.get_or_create(matricule=item.get('agent_releve'))
sous_traitant, sous_traitant_created = SousTraitant.objects.get_or_create(nom=item.get('sous_traitant'))
centre, centre_created = Centre.objects.get_or_create(code=item.get('centre_racc'))
# The remaining lookups reuse the instances obtained above.
unite_int, unite_int_created = UniteIntervention.objects.get_or_create(code_ui=item.get('code_ui'), sous_traitant=sous_traitant)
rep, rep_created = Repartiteur.objects.get_or_create(code=item.get('rep'), crac=centre)
equipe, equipe_created = Equipe.objects.get_or_create(nom=item.get('equipe'), unite=unite_int)
srep, srep_created = SousRepartiteur.objects.get_or_create(code=item.get('srp'), rep=rep)

Tadaaaaam, you'll create records that are "only" necessary for your related objects.

4. Caching out your related objects.

As in the previous section, I assume you have multiple rows for each related instance, so for each row you will still have to fetch that instance from your DB.

It's OK I guess if you're using SQLite in memory, it won't be as slow as with other DBs, still, it'll be a bottleneck. You could use an approach like:

# Maps (model, frozenset of lookup kwargs) -> the instance already fetched
# or created for that lookup.
MODEL_CACHE = {}


def get_related_instance(model, **kwargs):
    """Return the ``model`` instance matching ``kwargs``, hitting the DB once.

    The first call for a given model and set of keyword arguments delegates
    to ``model.objects.get_or_create(**kwargs)``; later calls with the same
    arguments are served from ``MODEL_CACHE``.

    Args:
        model: A class exposing ``objects.get_or_create(**kwargs)``.
        **kwargs: Lookup/creation arguments. Their values must be hashable
            because they become part of the cache key.

    Returns:
        The cached, fetched or newly created instance.
    """
    # A dict is unhashable and cannot be part of a dict key, so the keyword
    # arguments are frozen into an order-independent, hashable form.
    key = (model, frozenset(kwargs.items()))
    try:
        return MODEL_CACHE[key]
    except KeyError:
        instance, _created = model.objects.get_or_create(**kwargs)
        MODEL_CACHE[key] = instance
        return instance

# Instead of having previous lines now you end up with:
# Every related object now goes through the cache helper.
# There is no trailing comma after a call: it would bind a 1-tuple instead
# of the instance.
categorie = get_related_instance(Categorie, code_categorie=item.get('code_categorie'), nom="Public")
agent_sig = get_related_instance(AgentSig, matricule=item.get('agent_sig'), nom="Awa")
agent_essai = get_related_instance(AgentEssai, matricule=item.get('agent_essai'))
agent_ori = get_related_instance(AgentOri, matricule=item.get('agent_ori'))
agent_rel = get_related_instance(AgentRel, matricule=item.get('agent_releve'))
sous_traitant = get_related_instance(SousTraitant, nom=item.get('sous_traitant'))
centre = get_related_instance(Centre, code=item.get('centre_racc'))
# The remaining lookups reuse the instances obtained above.
unite_int = get_related_instance(UniteIntervention, code_ui=item.get('code_ui'), sous_traitant=sous_traitant)
rep = get_related_instance(Repartiteur, code=item.get('rep'), crac=centre)
equipe = get_related_instance(Equipe, nom=item.get('equipe'), unite=unite_int)
srep = get_related_instance(SousRepartiteur, code=item.get('srp'), rep=rep)

I cannot tell how much you'll gain thanks to that, it really depends on the data set you're trying to import, but from experience, it's quite drastic!

5. Use bulk_create

You are doing

for row in rows:
    DerangementAdsl(...your data...).save() # one DB call for every row

That's one SQL query per row, while you could do:

ITEMS = []
for row in rows:
    # ... the parsing shown previously goes here and produces ``item`` ...
    ITEMS.append(DerangementAdsl(**item))
DerangementAdsl.objects.bulk_create(ITEMS) # a single bulk_create call for all the rows

Putting it all together!

PARSER_CONFIG=(
    # Each entry: (column_index, variable_name, transformation_function)
    # ... entries elided in this recap ...
)
def no_transformation(value):
    """Return ``value`` converted to ``str``, with no other processing."""
    return str(value)

def strptime(value):
    """Convert a cell value into a ``datetime``.

    Args:
        value: Either a ``datetime`` (returned unchanged) or any object whose
            ``str()`` form looks like ``'YYYY-MM-DD HH:MM:SS'``.

    Returns:
        datetime.datetime: The parsed (naive) date and time.

    Raises:
        ValueError: If the text does not match ``'%Y-%m-%d %H:%M:%S'``.
    """
    # datetime.strptime() only accepts text, so a value that is already a
    # datetime is handed back as-is instead of being re-parsed.
    if isinstance(value, dt.datetime):
        return value
    return dt.datetime.strptime(str(value), '%Y-%m-%d %H:%M:%S')

# Maps (model, frozenset of lookup kwargs) -> the instance already fetched
# or created for that lookup.
MODEL_CACHE = {}


def get_related_instance(model, **kwargs):
    """Return the ``model`` instance matching ``kwargs``, hitting the DB once.

    The first call for a given model and set of keyword arguments delegates
    to ``model.objects.get_or_create(**kwargs)``; later calls with the same
    arguments are served from ``MODEL_CACHE``.

    Args:
        model: A class exposing ``objects.get_or_create(**kwargs)``.
        **kwargs: Lookup/creation arguments. Their values must be hashable
            because they become part of the cache key.

    Returns:
        The cached, fetched or newly created instance.
    """
    # A dict is unhashable and cannot be part of a dict key, so the keyword
    # arguments are frozen into an order-independent, hashable form.
    key = (model, frozenset(kwargs.items()))
    try:
        return MODEL_CACHE[key]
    except KeyError:
        instance, _created = model.objects.get_or_create(**kwargs)
        MODEL_CACHE[key] = instance
        return instance

def parse(row):
    """Transform a row into a dict, including its related model instances.

    The plain column values are extracted according to ``PARSER_CONFIG``;
    the related instances are then resolved through
    ``get_related_instance`` and stored in the same dict, replacing the raw
    value wherever the key is the same (e.g. ``'agent_sig'``, ``'rep'``).

    Args:
        row (tuple): Your row's data, indexable by column position.

    Returns:
        dict: Your parsed data, named into a dict.
    """
    item = {
        key: transform(row[index]) for index, key, transform in PARSER_CONFIG
    }

    # Parents are resolved first and kept in local names, because the
    # children below take them as arguments.
    sous_traitant = get_related_instance(SousTraitant, nom=item.get('sous_traitant'))
    centre = get_related_instance(Centre, code=item.get('centre_racc'))
    unite_int = get_related_instance(UniteIntervention, code_ui=item.get('code_ui'), sous_traitant=sous_traitant)
    rep = get_related_instance(Repartiteur, code=item.get('rep'), crac=centre)

    item.update({
        'categorie': get_related_instance(Categorie, code_categorie=item.get('code_categorie'), nom="Public"),
        'agent_sig': get_related_instance(AgentSig, matricule=item.get('agent_sig'), nom="Awa"),
        'agent_essai': get_related_instance(AgentEssai, matricule=item.get('agent_essai')),
        'agent_ori': get_related_instance(AgentOri, matricule=item.get('agent_ori')),
        'agent_rel': get_related_instance(AgentRel, matricule=item.get('agent_releve')),
        'sous_traitant': sous_traitant,
        'centre': centre,
        'unite_int': unite_int,
        'rep': rep,
        'equipe': get_related_instance(Equipe, nom=item.get('equipe'), unite=unite_int),
        'srep': get_related_instance(SousRepartiteur, code=item.get('srp'), rep=rep),
    })
    # NOTE(review): item still carries the raw helper values (e.g. 'nd',
    # 'code_ui', 'centre_racc'); presumably these are not model fields and
    # must be dropped before calling DerangementAdsl(**item) - confirm
    # against the model definition.
    return item

def importeradsl(request):
    """Import every data row of the uploaded workbook with one bulk insert.

    Args:
        request: The incoming request; the workbook is read from
            ``request.FILES["excel_file"]``.
    """
    # Request-method handling and upload validation are skipped for
    # readability.
    # NOTE(review): a real Django view must also return a response object.
    excel_file = request.FILES["excel_file"]
    wb = openpyxl.load_workbook(excel_file)
    worksheet = wb["Sheet 1"]

    # values_only=True makes iter_rows() yield the cell values themselves;
    # without it each row is a tuple of Cell objects and parse() would
    # stringify the objects rather than their contents.
    items = [
        DerangementAdsl(**parse(row))
        for row in worksheet.iter_rows(min_row=2, values_only=True)
    ]

    # One bulk_create call instead of one save() per row.
    DerangementAdsl.objects.bulk_create(items)

Conclusion

Following these recommendations, you should end up with an optimized script that runs way faster than the original one and is way more readable and pythonic.

Roughly, depending on your dataset, 5k lines should take somewhere between 10 seconds and a few minutes.

If each row's related instance (client,category...) is unique, I'd use a more sophisticated approach looping multiple times over your dataset to create related models using bulk_create and cache them out like:

# Build every Client in memory first...
CLIENTS = [Client(**client_parser(row)) for row in rows]
# ...then create *all* of them with a single bulk_create call.
clients = Client.objects.bulk_create(CLIENTS)

Then, you cache all created clients. You do the same for all your related models and eventually you'll load your data making a dozen of DB calls, but it really depends on your business logic here: It should be engineered to handle duplicated records too.



来源:https://stackoverflow.com/questions/58418482/how-to-make-fast-the-import-of-an-excel-file-containing-more-than-5000-lines-int

易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!