Question
I must insert two fields into a table: first the primary key (identifying the articles), and second their sizes.
In the source environment, the table holds the primary key (TK Articles) and a second field containing all the sizes concatenated together. In the target table, however, I must insert one row per size, i.e. the TK Articles repeated once for each size of the article. For example,
Source:
ART | SIZE
1 | 28/30
2 | 30/32
3 | Size 10/Size 12/Size 14/Size 16
Target:
ART | SIZE
1 | 28
1 | 30
2 | 30
2 | 32
3 | Size 10
3 | Size 12
3 | Size 14
3 | Size 16
The difficulty is that the number of '/' delimiters in the field is not known in advance.
I have written this query:
SELECT ART,
       REGEXP_SUBSTR(SIZE, '[^/]+', 1, level)
FROM   TABLLE
CONNECT BY REGEXP_SUBSTR(SIZE, '[^/]+', 1, level) IS NOT NULL;
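A common cause of slowness with this pattern is that the CONNECT BY clause, with no link back to the current row, generates a cross-product across all articles. The usual fix is to anchor each hierarchy to its own row, roughly like this (an untested sketch, reusing the table and column names from the question; a target table name is assumed):

```sql
INSERT INTO TARGET_TABLE (ART, SIZE)
SELECT ART,
       REGEXP_SUBSTR(SIZE, '[^/]+', 1, level)
FROM   TABLLE
CONNECT BY level <= REGEXP_COUNT(SIZE, '/') + 1
       AND PRIOR ART = ART
       AND PRIOR sys_guid() IS NOT NULL;  -- forces a new evaluation per row,
                                          -- preventing cycle detection errors
```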
The SELECT works and displays its results in 46 seconds. But the table has 100,000 rows, and the INSERT takes far too long and never completes.
Can somebody help me on this point?
Thanks & Regards
Answer 1:
Regular expressions are very expensive to compute. When a large number of rows needs to be processed, I would personally go with a stored procedure, specifically a pipelined table function:
-- table with 100000 rows
create table Tb_SplitStr(col1, col2) as
select level
, 'Size 10/Size 12/Size 14/Size 14/Size 15/Size 16/Size 17'
from dual
connect by level <= 100000
PL/SQL package:
create or replace package Split_Pkg as
  type T_StrList is table of varchar2(1000);

  function Str_Split(
    p_str in varchar2,
    p_dlm in varchar2
  ) return T_StrList pipelined;
end;
/

create or replace package body Split_Pkg as
  function Str_Split(
    p_str in varchar2,
    p_dlm in varchar2
  ) return T_StrList pipelined
  is
    l_src_str varchar2(1000) default p_str;
    l_dlm_pos number;
  begin
    while l_src_str is not null
    loop
      l_dlm_pos := instr(l_src_str, p_dlm);
      case
        when l_dlm_pos = 0 then
          pipe row (l_src_str);
          l_src_str := '';
        else
          pipe row (substr(l_src_str, 1, l_dlm_pos - 1));
          l_src_str := substr(l_src_str, l_dlm_pos + 1);
      end case;
    end loop;
    return;
  end;
end;
/
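As a quick sanity check, the function can also be called standalone (assuming the package above compiles):

```sql
select column_value as part
from   table(Split_Pkg.Str_Split('28/30/32', '/'));
-- PART
-- ----
-- 28
-- 30
-- 32
```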
SQL Query with regexp functions:
with ocrs(ocr) as (
  select level
  from  (select max(regexp_count(col2, '[^/]+')) as mx
         from   tb_splitStr) t
  connect by level <= t.mx
)
select count(regexp_substr(s.col2, '[^/]+', 1, o.ocr)) as res
from   tb_splitStr s
       cross join ocrs o;
Result:
-- SQL with regexp
SQL> with ocrs(ocr) as(
2 select level
3 from ( select max(regexp_count(col2, '[^/]+')) as mx
4 from tb_splitStr) t
5 connect by level <= t.mx
6 )
7 select count(regexp_substr(s.col2, '[^/]+', 1, o.ocr)) as res
8 from tb_splitStr s
9 cross join ocrs o
10 ;
Res
------------------------------
700000
Executed in 4.093 seconds
SQL> /
Res
------------------------------
700000
Executed in 3.812 seconds
--Query with pipelined table function
SQL> select count(*)
2 from Tb_SplitStr s
3 cross join table(split_pkg.Str_Split(s.col2, '/'))
4 ;
COUNT(*)
----------
700000
Executed in 2.469 seconds
SQL> /
COUNT(*)
----------
700000
Executed in 2.406 seconds
Answer 2:
This blogpost of mine shows six different techniques for handling this kind of query.
The difference is that it handles dates, while you need to handle strings. You can adapt it by using regexp_count(size, '/') + 1 as your iteration stopper and regexp_substr(size, '[^/]+', 1, i) in the select list.
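Applied to the original problem, that adaptation might look like the following (a sketch, not taken from the blog post; it uses the sys.odcinumberlist collection type to build the per-row iterator, and reuses the table and column names from the question with an assumed target table):

```sql
insert into TARGET_TABLE (ART, SIZE)
select t.ART,
       regexp_substr(t.SIZE, '[^/]+', 1, n.column_value)
from   TABLLE t,
       -- one number per size in the current row: 1 .. delimiter count + 1
       table(cast(multiset(
         select level
         from   dual
         connect by level <= regexp_count(t.SIZE, '/') + 1
       ) as sys.odcinumberlist)) n;
```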
Answer 3:
How about using some XML?
> set serveroutput on
> drop table test_tab
table TEST_TAB dropped.
> create table test_tab
(
art number,
siz varchar2(100)
)
table TEST_TAB created.
> insert into test_tab values (1, '28/30')
1 rows inserted.
> insert into test_tab values (2, '30/32')
1 rows inserted.
> insert into test_tab values (3, 'Size 10/Size 12/Size 14/Size 14')
1 rows inserted.
> commit
committed.
> drop table test_tab2
table TEST_TAB2 dropped.
> create table test_tab2 as
select * from test_tab where 1=0
table TEST_TAB2 created.
> insert into test_tab2 (art, siz)
select art, extractvalue(x.column_value, 'e')
from test_tab, xmltable ('e' passing xmlparse( content '<e>' || replace(siz, '/', '</e><e>') || '</e>')) x
8 rows inserted.
> commit
committed.
> select * from test_tab2
ART SIZ
---------- ----------------------------------------------------------------------------------------------------
1 28
1 30
2 30
2 32
3 Size 10
3 Size 12
3 Size 14
3 Size 14
8 rows selected
Here it is again, but starting with 100,000 rows and showing timings. The insert of 400,000 rows took just over 2 minutes:
> set serveroutput on
> set timing on
> drop table test_tab
table TEST_TAB dropped.
Elapsed: 00:00:00.055
> create table test_tab
(
art number,
siz varchar2(100)
)
table TEST_TAB created.
Elapsed: 00:00:00.059
> --insert into test_tab values (1, '28/30');
> --insert into test_tab values (2, '30/32');
> --insert into test_tab values (3, 'Size 10/Size 12/Size 14/Size 14');
> insert into test_tab (art, siz)
select level, 'Size 10/Size 12/Size 14/Size 16'
from dual
connect by level <= 100000
100,000 rows inserted.
Elapsed: 00:00:00.191
> commit
committed.
Elapsed: 00:00:00.079
> drop table test_tab2
table TEST_TAB2 dropped.
Elapsed: 00:00:00.081
> create table test_tab2 as
select * from test_tab where 1=0
table TEST_TAB2 created.
Elapsed: 00:00:00.076
> -- perform inserts. This will result in 400,000 rows inserted
> -- note inserts are done conventionally (timing is acceptable)
> insert into test_tab2 (art, siz)
select art, extractvalue(x.column_value, 'e')
from test_tab, xmltable ('e' passing xmlparse( content '<e>' || replace(siz, '/', '</e><e>') || '</e>')) x
400,000 rows inserted.
Elapsed: 00:02:17.046
> commit
committed.
Elapsed: 00:00:00.094
> -- show some data in target table
> select * from test_tab2
where art = 1
ART SIZ
---------- ----------------------------------------------------------------------------------------------------
1 Size 10
1 Size 12
1 Size 14
1 Size 16
Elapsed: 00:00:00.103
Source: https://stackoverflow.com/questions/18787116/oracle-sql-the-insert-query-with-regexp-substr-expression-is-very-long-split