Create multiple tasks in Airflow using a loop

Submitted by 寵の児 on 2021-02-11 13:46:49

Question


I want to create tasks that update rows and send a mail for every line in a data table. At the moment I create one task which downloads the data from the main table, but I cannot create a task for every line in the temp data table. Could you tell me what I am doing wrong and how I can generate and run tasks in a loop?

from datetime import datetime, timedelta

import airflow
from airflow import DAG
from airflow.contrib.operators.bigquery_operator import BigQueryOperator

from airflow.contrib.operators.bigquery_get_data import BigQueryGetDataOperator
from airflow.contrib.operators.bigquery_check_operator import BigQueryValueCheckOperator
from airflow.operators.python_operator import PythonOperator

default_args = {
    'owner': 'cmap',
    'depends_on_past': False,
    'start_date': airflow.utils.dates.days_ago(0),
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 0,
    'retry_delay': timedelta(minutes=5),
}


with DAG('dq_bigquery_test',
         max_active_runs=1,
         schedule_interval='@once',
         catchup=False,
         default_args=default_args) as dag:

    query = "SELECT * from `dbce-bi-prod-e6fd.dev_dataquality.data_logging_inc` where MailRequired = false"
    insert = "INSERT into dbce-bi-prod-e6fd.dev_dataquality.data_logging_inc (DataTimeStamp, Robot, Status) Values (CURRENT_TIMESTAMP(), 'TestRobot', 'Test')"

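    # Run the SELECT and write its results into the staging table dev_dataquality.testTable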
    my_bq_task = BigQueryOperator(
                    task_id='query_exc_on_teste',
                    sql=query,
                    write_disposition='WRITE_TRUNCATE',
                    create_disposition='CREATE_IF_NEEDED',
                    bigquery_conn_id='google_cloud_dbce_bi_prod',
                    use_legacy_sql=False,
                    destination_dataset_table='dev_dataquality.testTable')



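    # Fetch up to 100 rows from the staging table and push them to XCom as a list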
    get_data = BigQueryGetDataOperator(
        task_id='get_data_from_query',
        project_id='dbce-bi-prod-e6fd',
        dataset_id='dev_dataquality',
        table_id='testTable',
        max_results='100',
        selected_fields='Robot,Status,MailRequired',
        bigquery_conn_id='google_cloud_dbce_bi_prod'
        )

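    # Called at task run time: pulls the staged rows from XCom and loops over them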
    def process_data_from_bq(**kwargs):
        ti = kwargs['ti']
        update_column = []
        bq_data = ti.xcom_pull(task_ids='get_data_from_query')
        print(bq_data)
        # bq_data now holds the query results as a Python list of rows
        for index, i in enumerate(bq_data):
            update_query = "UPDATE `dbce-bi-prod-e6fd.dev_dataquality.data_logging_inc` SET MailSent = True WHERE Robot = '{}'".format(i[0])
            print(update_query)
            update_column.append(BigQueryOperator(
                    task_id='update_column_{}'.format(index),
                    sql=update_query,
                    write_disposition='WRITE_EMPTY',
                    create_disposition='CREATE_IF_NEEDED',
                    bigquery_conn_id='google_cloud_dbce_bi_prod',
                    use_legacy_sql=False,
                    dag=dag
                    ))
            if index != 0:
                update_column[index - 1] >> update_column[index]


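    # Execute the callable above; provide_context=True passes ti in via kwargs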
    process_data = PythonOperator(
        task_id='process_data_from_bq',
        python_callable=process_data_from_bq,
        provide_context=True
        )



    my_bq_task >> get_data >> process_data

Thank you for your help!
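A minimal sketch of one common fix, assuming Airflow 1.10's contrib BigQueryHook and the connection id and table names from the post: operators only become tasks if they exist when the scheduler parses the DAG file, so BigQueryOperator objects built inside a PythonOperator callable at run time are never registered or scheduled. One workaround is to run the per-row updates inside the callable itself through a hook:

from airflow.contrib.hooks.bigquery_hook import BigQueryHook

def process_data_from_bq(**kwargs):
    ti = kwargs['ti']
    bq_data = ti.xcom_pull(task_ids='get_data_from_query')
    # One task performs all the updates, instead of spawning new operators at run time
    cursor = BigQueryHook(bigquery_conn_id='google_cloud_dbce_bi_prod',
                          use_legacy_sql=False).get_conn().cursor()
    for row in bq_data:
        cursor.run_query("UPDATE `dbce-bi-prod-e6fd.dev_dataquality.data_logging_inc` "
                         "SET MailSent = True WHERE Robot = '{}'".format(row[0]))

The alternative of keeping one task per row only works when the rows are known at parse time: the loop that instantiates BigQueryOperator must sit at the top level of the DAG file, not inside a callable.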

Source: https://stackoverflow.com/questions/61719690/create-multiple-task-in-airflow-using-loop
