poj4052

题意:求一个文章(长度5.1e6)里面出现了多少个指定的模式串。重复出现只记一次。而且如果两个模式串都出现的情况下,一个是另一个的子串,则该子串不算出现过。

分析:AC自动机。

由于子串不算所以加一些特殊处理:

1.在文章匹配过程中,如果出现了一个模式串,我们不是把匹配数量+1,而是记录该模式串出现过(vis[id] = true;代码中vis[id]实际保存的是该模式串在Trie中的结尾节点编号,非0即表示出现过)。当然,Trie树中也记录了每个模式串的id。

2.在匹配结束后,我们遍历所有出现过的模式串,在Trie树中找到其所有出现过的子串,并将这些子串标为未出现过(vis[id] = false)。

要如何查找子串呢?

只需要记录每个出现过的串所对应的Trie树中的节点位置,由该节点向上走到root。其间走过的每个节点都沿着fail指针走到root一次。这样二重循环遍历到的所有节点就对应了Trie中所有该模式串的子串。

因为在AC自动机中父节点指针就是找前缀,fail指针就是找后缀。

#include <cstdio>
#include <cstring>
#include <queue>
#include <algorithm>
#include <cctype>
using namespace std;

// Debug hook: D(x) expands to nothing, so any statement wrapped in it is
// compiled out.
#define D(x)
// Capacity of the two text buffers below (5.1e6 characters plus slack).
const int MAX_LEN = (int)(5.1e6) + 10;
// Capacity of vis[]; pattern indices are 1-based, so at most MAX_N - 1
// patterns can be tracked.
const int MAX_N = 2505;
// Only used to size MAX_NODE_NUM.
// NOTE(review): presumably the maximum expanded pattern length plus slack -
// confirm against the problem limits.
const int MAX_FINGER_LEN = 1105;
// Second dimension of Trie::next (one slot per letter 'A'..'Z').
const int MAX_CHILD_NUM = 26;
// Number of node slots reserved in every per-node array of Trie.
const int MAX_NODE_NUM = MAX_N * MAX_FINGER_LEN;

// Number of patterns in the current test case (set by input()).
int n;
// Raw token as read by scanf (still in the bracket-compressed form).
char st[MAX_LEN];
// Output buffer of transform(): the expanded form of st.
char st2[MAX_LEN];
// vis[i] is the trie node id where pattern i ends if the pattern is currently
// counted as present, 0 otherwise (written by Trie::query and Trie::cal).
int vis[MAX_N];
// Per-node "already processed" flag shared by Trie::query and Trie::cal;
// work() clears it before each of the two passes.
bool check[MAX_NODE_NUM];

struct Trie
{
    int next[MAX_NODE_NUM][MAX_CHILD_NUM];
    int fail[MAX_NODE_NUM];
    int count[MAX_NODE_NUM];
    int father[MAX_NODE_NUM];
    int node_cnt;
    int root;

    void init()
    {
        node_cnt = 0;
        root = newnode();
    }

    int newnode()
    {
        for (int i = 0; i < 26; i++)
            next[node_cnt][i] = -1;
        count[node_cnt++] = 0;
        return node_cnt - 1;
    }

    int get_id(char a)
    {
        return a - ‘A‘;
    }

    void insert(char buf[], int index)
    {
        int len = strlen(buf);
        int now = root;
        for (int i = 0; i < len; i++)
        {
            int id = get_id(buf[i]);
            if (next[now][id] == -1)
            {
                next[now][id] = newnode();
                father[next[now][id]] = now;
            }
            now = next[now][id];
        }
        count[now] = index;
    }

    void build()
    {
        queue<int>Q;
        fail[root] = root;
        father[root] = root;
        for (int i = 0; i < 26; i++)
            if (next[root][i] == -1)
                next[root][i] = root;
            else
            {
                fail[next[root][i]] = root;
                Q.push(next[root][i]);
            }
        while (!Q.empty())
        {
            int now = Q.front();
            Q.pop();
            for (int i = 0; i < 26; i++)
                if (next[now][i] == -1)
                    next[now][i] = next[fail[now]][i];
                else
                {
                    fail[next[now][i]]=next[fail[now]][i];
                    Q.push(next[now][i]);
                }
        }
    }

    int query(char buf[])
    {
        int now = root;
        int res = 0;
        for (int i = 0; buf[i]; i++)
        {
            now = next[now][get_id(buf[i])];
            int temp = now;
            while (temp != root && !check[temp])
            {
                if (count[temp] != 0)
                    vis[count[temp]] = temp;
                check[temp] = true;
                temp = fail[temp];
            }
        }
        return res;
    }

    void debug()
    {
        for(int i = 0;i < node_cnt;i++)
        {
            printf("id = %3d,fail = %3d,end = %3d,chi = [",i,fail[i],count[i]);
            for(int j = 0;j < 26;j++)
                printf("%2d",next[i][j]);
            printf("]\n");
        }
    }

    void cal()
    {
        for (int i = 1; i <= n; i++)
        {
            if (vis[i] == 0)
            {
                continue;
            }
            int temp = vis[i];
            while (temp != root)
            {
                int temp2 = temp;
                while (temp2 != root && !check[temp2])
                {
                    if (count[temp2] != 0 && count[temp2] != i)
                    {
                        vis[count[temp2]] = 0;
                        check[temp2] = true;
                    }
                    temp2 = fail[temp2];
                }
                temp = father[temp];
            }
        }
    }
};

// Global automaton instance shared by input(), work() and main().
Trie ac;

/**
 * Expands the run-length notation used by the input into plain text.
 *
 * An uppercase letter is copied as is. Any other character is treated as the
 * opening delimiter of a group "<open><digits><char><close>" (e.g. "[7K]"),
 * which is replaced by <char> repeated <digits> times.
 *
 * @param st  NUL-terminated compressed string.
 * @param st2 output buffer; must be large enough for the expanded text plus
 *            the terminating NUL (no size is passed, so none is checked).
 *
 * A group that is cut off by the end of the string stops the expansion
 * instead of reading past the terminator.
 */
void transform(char st[], char st2[])
{
    int len = 0;
    int i = 0;
    while (st[i])
    {
        // <cctype> functions require a value representable as unsigned char.
        if (std::isupper(static_cast<unsigned char>(st[i])))
        {
            st2[len++] = st[i++];
            continue;
        }
        i++; // skip the opening delimiter
        int temp = 0;
        while (std::isdigit(static_cast<unsigned char>(st[i])))
        {
            temp *= 10;
            temp += st[i] - '0';
            i++;
        }
        const char repeated = st[i];
        if (repeated == 0)
        {
            break; // truncated group: nothing left to repeat
        }
        for (int j = 0; j < temp; j++)
        {
            st2[len + j] = repeated;
        }
        len += temp;
        i++; // consume the repeated character
        if (st[i])
        {
            i++; // consume the closing delimiter, if present
        }
    }
    st2[len] = 0;
}

void input()
{
    scanf("%d", &n);
    for (int i = 1; i <= n; i++)
    {
        scanf("%s", st);
        transform(st, st2);
        ac.insert(st2, i);
    }
}

/**
 * Solves one test case once the automaton is built and the expanded text is
 * in st2.
 *
 * @return number of patterns whose vis[] entry is still non-zero after the
 *         matching pass and the substring-elimination pass.
 */
int work()
{
    // Pass 1: scan the text; matched patterns get their end node stored in vis[].
    std::memset(vis, 0, sizeof vis);
    std::memset(check, 0, sizeof check);
    ac.query(st2);

    // Pass 2: check[] is reused with a different meaning, so start it clean.
    std::memset(check, 0, sizeof check);
    ac.cal();

    int survivors = 0;
    for (int id = 1; id <= n; ++id)
    {
        if (!vis[id])
            continue;
        D(printf("#%d\n", vis[id]));
        ++survivors;
    }
    return survivors;
}

/**
 * Entry point: reads the number of test cases, then for each case builds the
 * automaton from the patterns, reads and expands the text, and prints the
 * number of counted patterns on its own line.
 *
 * Stops quietly when the input ends or is malformed instead of working on
 * uninitialised or stale data.
 */
int main()
{
    int t = 0;
    if (scanf("%d", &t) != 1)
    {
        return 0;
    }
    // "> 0" keeps a negative count from looping until the counter wraps.
    while (t-- > 0)
    {
        ac.init();
        input();
        ac.build();
        if (scanf("%s", st) != 1)
        {
            break; // no text for this case
        }
        transform(st, st2);
        printf("%d\n", work());
    }
    return 0;
}

时间: 2024-12-04 23:25:51

poj4052的相关文章