sql >> データベース >  >> RDS >> PostgreSQL

多数の隣接する行に対してWHEREを使用すると、Postgresクエリが耐えがたいほど遅くなる

    ステップ1:ウィンドウ関数を使用して隣接するレコードを取得し、苦痛な自己結合を回避します(12テーブルの結合は、GEQOに処理が切り替わるしきい値に非常に近いです):

    -- Step 1 export (single-table layout).
    -- For every row of orderedflatcorpus, LEAD() over the id ordering pulls in
    -- values from the next one to ten rows, so no self-join per position is
    -- needed. Rows whose look-ahead values match the pattern in the WHERE clause
    -- are written to a tab-delimited CSV file with a header row.
    COPY (
        WITH lookahead AS (
            SELECT
                c1.id,
                c1.source,
                c1.word,
                -- offsets 1 and 2
                LEAD(c1.word, 1)  OVER by_id AS c2w,
                LEAD(c1.word, 2)  OVER by_id AS c3w,
                -- offset 3
                LEAD(c1.word, 3)  OVER by_id AS c4w,
                LEAD(c1.lemma, 3) OVER by_id AS c4l,
                LEAD(c1.pos, 3)   OVER by_id AS c4p,
                -- offset 4
                LEAD(c1.word, 4)  OVER by_id AS c5w,
                LEAD(c1.pos, 4)   OVER by_id AS c5p,
                -- offset 5
                LEAD(c1.word, 5)  OVER by_id AS c6w,
                LEAD(c1.lemma, 5) OVER by_id AS c6l,
                -- offset 6
                LEAD(c1.word, 6)  OVER by_id AS c7w,
                LEAD(c1.pos, 6)   OVER by_id AS c7p,
                -- offset 7
                LEAD(c1.word, 7)  OVER by_id AS c8w,
                -- offset 8
                LEAD(c1.word, 8)  OVER by_id AS c9w,
                LEAD(c1.lemma, 8) OVER by_id AS c9l,
                LEAD(c1.pos, 8)   OVER by_id AS c9p,
                -- offsets 9 and 10
                LEAD(c1.word, 9)  OVER by_id AS c10w,
                LEAD(c1.word, 10) OVER by_id AS c11w
            FROM orderedflatcorpus AS c1
            WINDOW by_id AS (ORDER BY c1.id)
        )
        -- c5p, c6l and c7p are only filtered on; they are not exported.
        SELECT
            id,
            source,
            word,
            c2w,
            c3w,
            c4w,
            c4l,
            c4p,
            c5w,
            c6w,
            c7w,
            c8w,
            c9w,
            c9l,
            c9p,
            c10w,
            c11w
        FROM lookahead
        WHERE c4p LIKE 'v%'
          AND c5p = 'appge'
          AND c6l = 'way'
          AND c7p LIKE 'i%'
          AND c8w = 'the'
          AND c9p LIKE 'n%'
        ORDER BY id
    )
    -- Alternative destination that was used before:
    --   '/home/postgres/Results/OUTPUT.csv'
    TO '/tmp/OUTPUT2.csv' DELIMITER E'\t' CSV HEADER;
    

    ステップ2:[データモデル] {word, lemma, pos} の各列は、組み合わせとしてカーディナリティの低いグループのようです。これらを別のテーブル(token/lemma/pos テーブル)に切り出すことができます:

        -- Step 2: factor the (word, lemma, pos) triple out of the corpus table
        -- into its own lookup table and link the two with a foreign key.

        -- Helper index on the three columns that are about to be factored out.
        -- It speeds up the DISTINCT extraction and the final back-fill UPDATE,
        -- and is dropped automatically once those columns are dropped.
        CREATE INDEX
            ON tmp.orderedflatcorpus (word, lemma, pos);

        ANALYZE tmp.orderedflatcorpus;

        -- Lookup table: one row per distinct (word, lemma, pos) combination.
        CREATE TABLE tmp.words AS
        SELECT DISTINCT
            word,
            lemma,
            pos
        FROM tmp.orderedflatcorpus;

        -- Surrogate key for the lookup table.
        ALTER TABLE tmp.words
            ADD COLUMN id SERIAL NOT NULL PRIMARY KEY;

        -- The factored-out triple is the natural key of the lookup table.
        ALTER TABLE tmp.words
            ADD UNIQUE (word, lemma, pos);

        -- Foreign-key column pointing from the corpus table at the lookup table.
        -- NOT NULL is deliberately left off: the column starts out empty and is
        -- filled by the UPDATE further down.
        ALTER TABLE tmp.orderedflatcorpus
            ADD COLUMN words_id INTEGER
            REFERENCES tmp.words (id);

        -- A supporting index on the referencing column helps the FK a lot.
        CREATE INDEX orderedflatcorpus_words_id_fk
            ON tmp.orderedflatcorpus (words_id);

        ANALYZE tmp.orderedflatcorpus;
        ANALYZE tmp.words;

        -- Back-fill the foreign key. The match uses IS NOT DISTINCT FROM rather
        -- than "=" because the joined columns may contain NULLs, and two NULLs
        -- must be treated as equal here.
        UPDATE tmp.orderedflatcorpus AS corpus
        SET words_id = lookup.id
        FROM tmp.words AS lookup
        WHERE corpus.word  IS NOT DISTINCT FROM lookup.word
          AND corpus.lemma IS NOT DISTINCT FROM lookup.lemma
          AND corpus.pos   IS NOT DISTINCT FROM lookup.pos;

        -- The original columns now live in tmp.words; remove them from the
        -- corpus table.
        ALTER TABLE tmp.orderedflatcorpus
            DROP COLUMN word,
            DROP COLUMN lemma,
            DROP COLUMN pos;
    

    そして、words-tableへのJOINを使用した新しいクエリ:

    -- Export for the two-table layout.
    -- Each corpus row is joined to its words row through words_id; LEAD() over
    -- the corpus id ordering then pulls word/lemma/pos from the next one to ten
    -- rows. Rows matching the pattern in the WHERE clause are written to a
    -- tab-delimited CSV file with a header row.
    COPY (
        WITH lookahead AS (
            SELECT
                c1.id,
                c1.source,
                w.word,
                -- offsets 1 and 2
                LEAD(w.word, 1)  OVER by_id AS c2w,
                LEAD(w.word, 2)  OVER by_id AS c3w,
                -- offset 3
                LEAD(w.word, 3)  OVER by_id AS c4w,
                LEAD(w.lemma, 3) OVER by_id AS c4l,
                LEAD(w.pos, 3)   OVER by_id AS c4p,
                -- offset 4
                LEAD(w.word, 4)  OVER by_id AS c5w,
                LEAD(w.pos, 4)   OVER by_id AS c5p,
                -- offset 5
                LEAD(w.word, 5)  OVER by_id AS c6w,
                LEAD(w.lemma, 5) OVER by_id AS c6l,
                -- offset 6
                LEAD(w.word, 6)  OVER by_id AS c7w,
                LEAD(w.pos, 6)   OVER by_id AS c7p,
                -- offset 7
                LEAD(w.word, 7)  OVER by_id AS c8w,
                -- offset 8
                LEAD(w.word, 8)  OVER by_id AS c9w,
                LEAD(w.lemma, 8) OVER by_id AS c9l,
                LEAD(w.pos, 8)   OVER by_id AS c9p,
                -- offsets 9 and 10
                LEAD(w.word, 9)  OVER by_id AS c10w,
                LEAD(w.word, 10) OVER by_id AS c11w
            FROM orderedflatcorpus AS c1
            INNER JOIN words AS w
                ON w.id = c1.words_id
            WINDOW by_id AS (ORDER BY c1.id)
        )
        -- c5p, c6l and c7p are only filtered on; they are not exported.
        SELECT
            id,
            source,
            word,
            c2w,
            c3w,
            c4w,
            c4l,
            c4p,
            c5w,
            c6w,
            c7w,
            c8w,
            c9w,
            c9l,
            c9p,
            c10w,
            c11w
        FROM lookahead
        WHERE c4p LIKE 'v%'
          AND c5p = 'appge'
          AND c6l = 'way'
          AND c7p LIKE 'i%'
          AND c8w = 'the'
          AND c9p LIKE 'n%'
        ORDER BY id
    )
    -- Alternative destination that was used before:
    --   '/home/postgres/Results/OUTPUT.csv'
    TO '/tmp/OUTPUT3.csv' DELIMITER E'\t' CSV HEADER;
    

    注:条件を少し緩和しすぎたため、出力に2行が表示されます...

    更新:最初のクエリを、CTEを使わない形(FROM句のサブクエリ)に書き換えたもの

    -- Single-table export written with a derived table in FROM instead of a CTE.
    -- The inner query attaches, to every row of orderedflatcorpus, values taken
    -- from the next one to ten rows (ordered by id) via LEAD(); the outer query
    -- keeps the rows whose look-ahead values match the pattern and the result is
    -- written to a tab-delimited CSV file with a header row.
    COPY (
        SELECT
            id,
            source,
            word,
            c2w,
            c3w,
            c4w,
            c4l,
            c4p,
            c5w,
            c6w,
            c7w,
            c8w,
            c9w,
            c9l,
            c9p,
            c10w,
            c11w
        FROM (
            SELECT
                c1.id,
                c1.source,
                c1.word,
                -- offsets 1 and 2
                LEAD(c1.word, 1)  OVER by_id AS c2w,
                LEAD(c1.word, 2)  OVER by_id AS c3w,
                -- offset 3
                LEAD(c1.word, 3)  OVER by_id AS c4w,
                LEAD(c1.lemma, 3) OVER by_id AS c4l,
                LEAD(c1.pos, 3)   OVER by_id AS c4p,
                -- offset 4
                LEAD(c1.word, 4)  OVER by_id AS c5w,
                LEAD(c1.pos, 4)   OVER by_id AS c5p,
                -- offset 5
                LEAD(c1.word, 5)  OVER by_id AS c6w,
                LEAD(c1.lemma, 5) OVER by_id AS c6l,
                -- offset 6
                LEAD(c1.word, 6)  OVER by_id AS c7w,
                LEAD(c1.pos, 6)   OVER by_id AS c7p,
                -- offset 7
                LEAD(c1.word, 7)  OVER by_id AS c8w,
                -- offset 8
                LEAD(c1.word, 8)  OVER by_id AS c9w,
                LEAD(c1.lemma, 8) OVER by_id AS c9l,
                LEAD(c1.pos, 8)   OVER by_id AS c9p,
                -- offsets 9 and 10
                LEAD(c1.word, 9)  OVER by_id AS c10w,
                LEAD(c1.word, 10) OVER by_id AS c11w
            FROM orderedflatcorpus AS c1
            WINDOW by_id AS (ORDER BY c1.id)
        ) AS lookahead
        -- c5p, c6l and c7p are only filtered on; they are not exported.
        WHERE c4p LIKE 'v%'
          AND c5p = 'appge'
          AND c6l = 'way'
          AND c7p LIKE 'i%'
          AND c8w = 'the'
          AND c9p LIKE 'n%'
        ORDER BY id
    )
    -- Alternative destination that was used before:
    --   '/home/postgres/Results/OUTPUT.csv'
    TO '/tmp/OUTPUT2a.csv' DELIMITER E'\t' CSV HEADER;
    

    [2番目のクエリでも同様の変換を実行できます]

    更新2:2テーブル構成(wordsテーブルを結合する版)のサブクエリバージョン。

    -- Two-table layout, written with a derived table in FROM instead of a CTE.
    -- As written this is a plain SELECT; the wrappers that were tried around it
    -- are kept here as comments only:
    --   copy( ... )
    --   EXPLAIN ANALYZE
    SELECT
        c1i,
        c1s,
        c1w,
        c2w,
        c3w,
        c4w,
        c4l,
        c4p,
        c5w,
        c6w,
        c7w,
        c8w,
        c9w,
        c9l,
        c9p,
        c10w,
        c11w
    FROM (
        SELECT
            c1.id     AS c1i,
            c1.source AS c1s,
            w1.word   AS c1w,
            -- offsets 1 and 2
            LEAD(w1.word, 1)  OVER fwd10 AS c2w,
            LEAD(w1.word, 2)  OVER fwd10 AS c3w,
            -- offset 3
            LEAD(w1.word, 3)  OVER fwd10 AS c4w,
            LEAD(w1.lemma, 3) OVER fwd10 AS c4l,
            LEAD(w1.pos, 3)   OVER fwd10 AS c4p,
            -- offset 4
            LEAD(w1.word, 4)  OVER fwd10 AS c5w,
            LEAD(w1.pos, 4)   OVER fwd10 AS c5p,
            -- offset 5
            LEAD(w1.word, 5)  OVER fwd10 AS c6w,
            LEAD(w1.lemma, 5) OVER fwd10 AS c6l,
            -- offset 6
            LEAD(w1.word, 6)  OVER fwd10 AS c7w,
            LEAD(w1.pos, 6)   OVER fwd10 AS c7p,
            -- offset 7
            LEAD(w1.word, 7)  OVER fwd10 AS c8w,
            -- offset 8
            LEAD(w1.word, 8)  OVER fwd10 AS c9w,
            LEAD(w1.lemma, 8) OVER fwd10 AS c9l,
            LEAD(w1.pos, 8)   OVER fwd10 AS c9p,
            -- offsets 9 and 10
            LEAD(w1.word, 9)  OVER fwd10 AS c10w,
            LEAD(w1.word, 10) OVER fwd10 AS c11w
        FROM orderedflatcorpus AS c1
        INNER JOIN words AS w1
            ON w1.id = c1.words_id
        -- Always-true anchor so that optional predicates (such as the disabled
        -- ones below) can each be added as a line starting with AND.
        WHERE TRUE
        /*
            Disabled experiment, kept for reference: these EXISTS probes *could*
            prune unmatched rows before the window step, but the author could
            not get that to work. Note that the pos/word tests inside each probe
            are themselves commented out.

            AND EXISTS (SELECT * FROM orderedflatcorpus c4 JOIN words w4 ON w4.id = c4.words_id
                        WHERE c4.id = 3 + c1.id -- AND w4.pos LIKE 'v%'
                       )
            AND EXISTS (SELECT * FROM orderedflatcorpus c5 JOIN words w5 ON w5.id = c5.words_id
                        WHERE c5.id = 4 + c1.id -- AND w5.pos = 'appge'
                       )
            AND EXISTS (SELECT * FROM orderedflatcorpus c7 JOIN words w7 ON w7.id = c7.words_id
                        WHERE c7.id = 6 + c1.id -- AND w7.pos LIKE 'i%'
                       )
            AND EXISTS (SELECT * FROM orderedflatcorpus c9 JOIN words w9 ON w9.id = c9.words_id
                        WHERE c9.id = 8 + c1.id -- AND w9.pos LIKE 'n%'
                       )
            AND EXISTS (SELECT * FROM orderedflatcorpus c8 JOIN words w8 ON w8.id = c8.words_id
                        WHERE c8.id = 7 + c1.id -- AND w8.word = 'the'
                       )
        */
        -- The window carries an explicit frame clause, so it is referenced as
        -- "OVER fwd10" (by name, without parentheses).
        WINDOW fwd10 AS (
            ORDER BY c1.id
            ROWS BETWEEN CURRENT ROW AND 10 FOLLOWING
        )
    ) AS lookahead
    -- c5p, c6l and c7p are only filtered on; they are not returned.
    WHERE c4p LIKE 'v%'
      AND c5p = 'appge'
      AND c6l = 'way'
      AND c7p LIKE 'i%'
      AND c8w = 'the'
      AND c9p LIKE 'n%'
    ORDER BY c1i;
    -- Closing half of the disabled COPY wrapper and its candidate destinations:
    --   )
    --   TO '/home/postgres/Results/OUTPUT.csv' DELIMITER E'\t' csv header;
    --   TO '/tmp/OUTPUT3b.csv' DELIMITER E'\t' csv header;
    


    1. PHPでmysqlLOADステートメントを使用すると失敗しますが、コマンドラインから実行すると機能します

    2. ONLY_FULL_GROUP_BYを無効にしない理由

    3. Oracleのバグにより、JSON_ARRAYAGGで重複する集計値が生成されます

    4. Spring Jdbctemplate.update(String sql、obj ... args)を使用して挿入されたIDを取得する方法