HDU4782 Beautiful Soup

成都赛里的一道坑爹码力题，突然间脑抽想做一下弥补一下当时的遗憾。当时没做出这道题一是因为当时只剩大概45分钟，对于这样的具有各种条件的题无从下手，二则是因为当时估算着已经有银牌了，所以就不挣扎了。但是像这种题还是一定要敲一下的。

这学期学了编译原理，知道了一些在编译上处理这种题目的一些姿势，例如自动机、parse tree什么的，所以写起来就会更清晰。其实说白了本题的难点在于tokenizer，就是将里面有意义的部分全部弄出来，归结起来可以看成4种，分别是opentag、closetag、blanktag、string，根据有无<>以及/的位置就可以确定下是属于哪一种。然后题目最麻烦的其实就正如它文末的那句：“You quickly realize that your only job is to deal with the white spaces.”

吃空格可以用下面的一句while解决：while((c=getchar())&&isSpace(c));
这样就可以得到第一个不是空格的字符，然后就是不停地吃后面的字符，当吃到空格或者是'<'的时候就表示到了一个分隔符，前面的字符串就可以先弄出来了。注意的是'<'会作为下一个token的第一个字符，所以要加个save表示是否存起来。

tokenizer的机理大概是这样的。
首先要得到下一个token的第一个字符，如果save==true，表示已经有了，否则利用while((c=getchar())&&isSpace(c))吃出第一个非空字符。根据第一个字符类型判断是string还是tag，如果是tag就不停地吃，直到吃到的字符是'>'；如果是string，就不停地吃，直到吃到第一个分隔符。

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

#pragma warning(disable:4996)

#include <iostream>

#include <cstring>

#include <string>

#include <cstdio>

using
namespace std;

// Capacity, in characters, of the buffer that holds the current token.
const int maxn = 1000000;

// Token categories stored in tokenType and lastType.
enum {
    INIT = 0,      // value lastType is reset to at the start of each case
    OPENTAG = 1,   // tag that is neither a closing nor a self-closing tag
    CLOSETAG = 2,  // tag whose second character is '/'
    BLANKTAG = 3,  // tag whose character before the closing '>' is '/'
    STRING = 4,    // run of text outside of any tag
    END = 5
};

// Text of the token most recently produced by nextToken(), NUL-terminated.
char token[maxn];

// Write position inside `token` while a token is being assembled.
int cur;

// Category of the token currently held in `token`.
int tokenType;

// Category of the previously handled token; main() uses it to decide
// whether a pending text line has to be finished or continued.
int lastType;

// Number of spaces printSpace() emits in front of a line.
int indent;

// True when nextToken() has already consumed the first character of the
// following token; that character is kept in `savechar`.
bool save;
char savechar;

// Returns true when `c` is a whitespace character that separates tokens:
// a space, a horizontal tab, or a newline.
//
// NOTE(review): the published listing had its character literals turned into
// typographic quotes and compared against a space twice; the second
// comparison is restored here as '\t', which is presumably what the tab
// literal was before it got flattened - confirm against the problem statement.
bool isSpace(char c)
{
    return c == ' ' || c == '\t' || c == '\n';
}

// Returns true when `c` ends a text token: either a whitespace character
// (space, horizontal tab, newline) or '<', which opens the next tag.
//
// NOTE(review): the published listing had its character literals turned into
// typographic quotes and compared against a space twice; the second
// comparison is restored here as '\t', which is presumably what the tab
// literal was before it got flattened - confirm against the problem statement.
bool isdel(char c)
{
    return c == ' ' || c == '\t' || c == '\n' || c == '<';
}

// Writes `indent` space characters to stdout, i.e. the indentation prefix of
// the line that is about to be printed. Nothing is written when `indent` is
// zero or negative.
void printSpace()
{
    for (int i = 0; i < indent; i++) {
        putchar(' ');
    }
}

void
nextToken()

{

char
c; cur = 0;

if
(save){

c = savechar; token[cur++] = c; save = false;

}

else{

while
((c = getchar()) && isSpace(c));

token[cur++] = c;

}

if
(token[0] == ‘<‘){

while
((c = getchar())&&c!=‘>‘){

token[cur++] = c;

}

token[cur++] = c;

if
(token[cur - 2] == ‘/‘) tokenType = BLANKTAG;

else
if (token[1] == ‘/‘) tokenType = CLOSETAG;

else
tokenType = OPENTAG;

token[cur++] = ‘\0‘; return;

}

else{

while
((c = getchar()) && !isdel(c)){

token[cur++] = c;

}

if
(c == ‘<‘){

save = true; savechar = ‘<‘;

}

token[cur++] = ‘\0‘; tokenType = STRING;

}

int
main()

{

//freopen("in.txt", "r", stdin);

//freopen("out.txt", "w", stdout);

int
T; cin >> T; int
ca = 0; bool
endcase = false;

indent = 0; lastType = INIT; save = false;

while
(1)

{

if
(!endcase) {

printf("Case #%d:\n", ++ca); endcase = true;

}

nextToken();

if
(tokenType == OPENTAG){

if
(lastType == STRING) puts("");

printSpace();

printf("%s\n", token);

++indent;

}

else
if (tokenType == CLOSETAG){

if
(lastType == STRING) puts("");

--indent;

printSpace();

printf("%s\n", token);

}

else
if (tokenType == BLANKTAG){

if
(lastType == STRING) puts("");

printSpace();

printf("%s\n", token);

}

else{

if
(lastType == STRING) putchar(‘ ‘);

else
{

printSpace();

}

printf("%s", token);

}

if
(strcmp(token, "</html>") == 0){

endcase = false; lastType = INIT; indent = 0;

if
(ca == T){

break;

}

lastType = tokenType;

}

return
0;

}

时间： 2024-08-30 15:08:13

HDU4782 Beautiful Soup

HDU4782 Beautiful Soup的相关文章

HDU4782——Beautiful Soup（模拟）

[Python]HTML/XML解析器Beautiful Soup

python标准库Beautiful Soup与MongoDb爬喜马拉雅电台的总结

HDU 4782 Beautiful Soup（模拟）

爬虫学习——网页解析器Beautiful Soup

Beautiful Soup的使用

2017.08.11 Python网络爬虫实战之Beautiful Soup爬虫

Python爬虫利器二之Beautiful Soup的用法

推荐一些python Beautiful Soup学习网址