[转]用 C 语言编写一个网络蜘蛛

用 C 语言编写一个网络蜘蛛来搜索网上出现的电子邮件地址

作者：zhoulifa
来源：http://bbs.chinaunix.net/viewthread.php?tid=821361

可能大家经常要去互联网上搜索特定的内容，比如收集大量邮件地址，如果用 google 之类的搜索引擎是没法实现这种特定功能的，所以用 C 语言来写一个吧。它的功能就是不断去取得网络上的页面，然后分析出网页上出现的邮件地址保存下来。象个蜘蛛一样，从网络上一个网页爬向另一个网页，不停止地搜索邮件地址。

即：分析程序运行时的参数，把各网页地址作为根节点加入到链表，然后从链表头开始处理各节点

对整个链表的处理是先处理兄弟节点，流程图如下：

然后再处理各节点的子节点，流程图如下：

当然，这里采用了递归调用方法，处理子节点的数据时和处理整个链表一样循环处理就是了。

/************关于本文档********************************************
*filename: 用 C 语言编写一个网络蜘蛛来搜索网上出现的电子邮件地址
*purpose: 一个邮址搜索程序的雏形
*wrote by: zhoulifa(zhoulifa@163.com) 周立发(http://zhoulifa.bokee.com)
Linux爱好者 Linux知识传播者 SOHO族开发者最擅长C语言
*date time:2006-08-31 21:00:00
*Note: 任何人可以任意复制代码并运用这些文档，当然包括你的商业用途
* 但请遵循GPL
*Hope:希望越来越多的人贡献自己的力量，为科学技术发展出力
*********************************************************************/

程序在运行的过程中要建立一个树形链表结构，结构图如下：

程序启动时分析所带参数，把各参数加入到根网页节点，如果有多个参数则这个根网页有兄弟节点。
然后从根节点开始处理这一级上各节点，把各节点网页上出现的网页链接加到该节点的子节点上，处理完当前这一级后处理子节点这一级。

当然这只是一个原理展示程序，并没有进行优化。

这个程序的 main 函数流程图如下：

源代码如下：

#include < sys / types.h >
#include < sys / stat.h >
#include < fcntl.h >
#include < sys / mman.h >
#include < unistd.h >
#include < stdio.h >
#include < string .h >
#include < stdlib.h >
#include < netdb.h >
#include < errno.h >
#include < locale.h >

#define USERAGENT "Wget/1.10.2"
#define ACCEPT "*/*"
#define ACCEPTLANGUAGE "zh-cn,zh;q=0.5"
#define ACCEPTENCODING "gzip,deflate"
#define ACCEPTCHARSET "gb2312,utf-8;q=0.7,*;q=0.7"
#define KEEPALIVE "300"
#define CONNECTION "keep-alive"
#define CONTENTTYPE "application/x-www-form-urlencoded"

#define MAXFILENAME 14
#define DEBUG 1

typedef struct webnode {
         char * host;                  /* 网页所在的主机 */
         int port;                     /* 网络服务器所使用的端口 */
         char * dir;                   /* 网页所在的目录 */
         char * page;                  /* 网页文件名 */
         char * file;                  /* 本地保存的文件名 */
         char IsHandled;               /* 是否处理过 */
         struct webnode * brother;     /* 兄弟节点链表指针 */
         struct webnode * child;       /* 子节点链表指针 */
} WEBNODE;

struct sockaddr_in server_addr;
int sockfd = 0 , dsend = 0 , totalsend = 0 , nbytes = 0 , reqn = 0 , i = 0 , j = 0 , ret = 0 ;
struct hostent * host;
char request[ 409600 ] = "" , buffer[ 1024 ] = "" , httpheader[ 1024 ] = "" ;
int FileNumber = 0 ;
char e[ 2 ] = " @/ " ;
WEBNODE * NodeHeader, * NodeTail, * NodeCurr;
char * mapped_mem;

int GetHost( char * , char ** , char ** , int * , char ** ); /**/
void AnalyzePage(WEBNODE * ); /**/
void AddInitNode( char * , char * , int , char * ); /**/
void HandleInitNode(WEBNODE * ); /**/
void DisplayNode(WEBNODE * ); /**/
void HandOneNode(WEBNODE * ); /**/
void DoneWithList( int ); /**/
void DoOnce(); /**/
void ConnectWeb( void ); /**/
void SendRequest( void ); /**/
void ReceiveResponse( void ); /**/
void GetEmail( char * ); /**/
void GetLink( char * ); /**/
void GetBeforePos( char * , char ** ); /**/
void GetAfterPos( char * , char ** ); /**/
void AddChildNode(WEBNODE * , char * ); /**/
void GetAfterPosWithSlash( char * , char ** ); /**/
void GetMemory( char ** , int ); /**/
int IsExistWeb(WEBNODE * , char * , char * , int , char * ); /**/
void Rstrchr( char * , int , char ** ); /**/
int GetLocalAgent( char * UserAgent, char * Accept, char * AcceptLanguage, char * AcceptEncoding, char * AcceptCharset, char * KeepAlive, char * Connection, char * ContentType); /**/

/* *************************************************************
功能：设置 HTTP 协议头内容的一些固定值
************************************************************** */
int GetLocalAgent( char * UserAgent, char * Accept, char * AcceptLanguage, char * AcceptEncoding, char * AcceptCharset, char * KeepAlive, char * Connection, char * ContentType)
{
  memcpy(UserAgent, USERAGENT, strlen(USERAGENT));
  memcpy(Accept, ACCEPT, strlen(ACCEPT));
  memcpy(AcceptLanguage, ACCEPTLANGUAGE, strlen(ACCEPTLANGUAGE));
  memcpy(AcceptEncoding, ACCEPTENCODING, strlen(ACCEPTENCODING));
  memcpy(AcceptCharset, ACCEPTCHARSET, strlen(ACCEPTCHARSET));
  memcpy(KeepAlive, KEEPALIVE, strlen(KEEPALIVE));
  memcpy(Connection, CONNECTION, strlen(CONNECTION));
  memcpy(ContentType, CONTENTTYPE, strlen(CONTENTTYPE));
   return 0 ;
}

/* *************************************************************
功能：在字符串 s 里搜索 x 字符，并设置指针 d 指向该位置
************************************************************** */
void Rstrchr( char * s, int x, char ** d)
{
         int len = strlen(s) - 1 ;
         while (len >= 0 )        {
                 if (x == s[len]) {( * d) = s + len; return ;}
                len -- ;
        }
        ( * d) = 0 ;
}

/* *************************************************************
功能：连接一个网站服务器
************************************************************** */
void ConnectWeb( void ) { /* connect to web server */
   /* create a socket descriptor */
   if ((sockfd = socket(PF_INET,SOCK_STREAM, 0 )) ==- 1 )
  {
    fprintf(stderr, " Socket Error:%sa " ,strerror(errno));
    exit( 1 );
  }

   /* bind address */
  bzero( & server_addr, sizeof (server_addr));
  server_addr.sin_family = AF_INET;
  server_addr.sin_port = htons(NodeCurr -> port);
  server_addr.sin_addr = * (( struct in_addr * )host -> h_addr);

   /* connect to the server */
   if (connect(sockfd, ( struct sockaddr * )( & server_addr), sizeof ( struct sockaddr)) == - 1 )
  {
    fprintf(stderr, " Connect Error:%sa " , strerror(errno));
    exit( 1 );
  }
}

/* *************************************************************
功能：向网站发送 HTTP 请求
************************************************************** */
void SendRequest( void ) { /* send my http-request to web server */
  dsend = 0 ;totalsend = 0 ;
  nbytes = strlen(request);
   while (totalsend < nbytes) {
    dsend = write(sockfd, request + totalsend, nbytes - totalsend);
     if (dsend ==- 1 )  {fprintf(stderr, " send error!%s " , strerror(errno));exit( 0 );}
    totalsend += dsend;
    fprintf(stdout, " Request.%d %d bytes send OK! " , reqn, totalsend);
  }
}

/* *************************************************************
功能：接收网站的 HTTP 返回
************************************************************** */
void ReceiveResponse( void ) { /* get response from web server */
  fd_set writefds;
   struct timeval tival;
   int retry = 0 ;
  FILE * localfp = NULL;

  i = 0 ; j = 0 ;
__ReCeive:
  FD_ZERO( & writefds);
  tival.tv_sec = 10 ;
  tival.tv_usec = 0 ;
   if (sockfd > 0 ) FD_SET(sockfd, & writefds);
   else {fprintf(stderr, " Error, socket is negative! " ); exit( 0 );}

  ret = select(sockfd + 1 , & writefds, NULL, NULL, & tival);
   if (ret == 0 ) {
     if (retry ++ < 10 ) goto __ReCeive;
  }
   if (ret <= 0 ) {fprintf(stderr, " Error while receiving! " ); exit( 0 );}

   if (FD_ISSET(sockfd, & writefds))    {
    memset(buffer, 0 , 1024 );
    memset(httpheader, 0 , 1024 );
     if ((localfp = fopen(NodeCurr -> file, " w " )) == NULL) { if (DEBUG) fprintf(stderr, " create file '%s' error " , NodeCurr -> file); return ;}
     /* receive data from web server */
     while ((nbytes = read(sockfd,buffer, 1 )) == 1 )
    {
       if (i < 4 )  { /* 获取 HTTP 消息头 */
         if (buffer[ 0 ] == ' ' || buffer[ 0 ] == ' ' )  i ++ ;
         else i = 0 ;
        memcpy(httpheader + j, buffer, 1 ); j ++ ;
      }
       else   { /* 获取 HTTP 消息体 */
        fprintf(localfp, " %c " , buffer[ 0 ]); /* print content on the screen */
         // fprintf(stdout, "%c", buffer[0]); /* print content on the screen */
        i ++ ;
      }
    }
    fclose(localfp);
  }
}

/* *************************************************************
功能：执行一次 HTTP 请求
************************************************************** */
void DoOnce() { /* send and receive */
  ConnectWeb(); /* connect to the web server */

   /* send a request */
  SendRequest();

   /* receive a response message from web server */
  ReceiveResponse();

  close(sockfd); /* because HTTP protocol do something one connection, so I can close it after receiving */
}

/* *************************************************************
功能：执行 HTTP 请求
************************************************************** */
void DoneWithList( int flag) {
   if (flag) fprintf(stdout, " Request.%d is: %s " , ++ reqn, request);

  DoOnce();

   if (flag) fprintf(stdout, " The following is the response header: %s " , httpheader);
}

/* *************************************************************
功能：从字符串 src 中分析出网站地址和端口，并得到文件和目录
************************************************************** */
int GetHost( char * src, char ** web, char ** file, int * port, char ** dir)  {
   char * pA, * pB, * pC;
   int len;

   * port = 0 ;
   if ( ! ( * src))   return - 1 ;
  pA = src;
   if ( ! strncmp(pA, " http:// " , strlen( " http:// " )))  pA = src + strlen( " http:// " );
   /* else if(!strncmp(pA, "https://", strlen("https://")))  pA = src+strlen("https://"); */
   else return 1 ;
  pB = strchr(pA, ' / ' );
   if (pB)  {
    len = strlen(pA) - strlen(pB);
    GetMemory(web, len);
    memcpy(( * web), pA, len);
     if ( * (pB + 1 ))  {
      Rstrchr(pB + 1 , ' / ' , & pC);
       if (pC) len = strlen(pB + 1 ) - strlen(pC);
       else len = 0 ;
       if (len > 0 ) {
        GetMemory(dir, len);
        memcpy(( * dir), pB + 1 , len);

         if (pC + 1 ) {
          len = strlen(pC + 1 );
          GetMemory(file, len);
          memcpy(( * file), pC + 1 , len);
        }
         else {
          len = 1 ;
          GetMemory(file, len);
          memcpy(( * file), e, len);
        }
      }
       else {
        len = 1 ;
        GetMemory(dir, len);
        memcpy(( * dir), e + 1 , len);

        len = strlen(pB + 1 );
        GetMemory(file, len);
        memcpy(( * file), pB + 1 , len);
      }
    }
     else {
      len = 1 ;
      GetMemory(dir, len);
      memcpy(( * dir), e + 1 , len);

      len = 1 ;
      GetMemory(file, len);
      memcpy(( * file), e, len);
    }
  }
   else   {
    len = strlen(pA);
    GetMemory(web, len);
    memcpy(( * web), pA, strlen(pA));
    len = 1 ;
    GetMemory(dir, len);
    memcpy(( * dir), e + 1 , len);
    len = 1 ;
    GetMemory(file, len);
    memcpy(( * file), e, len);
  }

  pA = strchr(( * web), ' : ' );
   if (pA)   * port = atoi(pA + 1 );
   else * port = 80 ;

   return 0 ;
}

/* ********************************************************************
*filename: mailaddrsearch.c
*purpose: 用 C 语言编写一个网络蜘蛛来搜索网上出现的电子邮件地址
*tidied by: zhoulifa(zhoulifa@163.com) 周立发( http://zhoulifa.bokee.com )
Linux爱好者 Linux知识传播者 SOHO族开发者最擅长C语言
*date time:2006-08-31 21:00:00
*Note: 任何人可以任意复制代码并运用这些文档，当然包括你的商业用途
* 但请遵循GPL
*Thanks to: www.gd-linux.org 广东省 Linux 公共服务技术支持中心
******************************************************************** */

int main( int argc, char ** argv)
{
         int WebPort;
         char * WebHost = 0 , * PageAddress = 0 , * WebDir = 0 ;

         if (argc < 2 ) { if (DEBUG) fprintf(stdout, " Command error, you should input like this: %s WebPageAddress1 WebPageAddress2 WebPageAddress3 ... " , argv[ 0 ]); exit( 0 );}

        NodeHeader = NodeTail = NodeCurr = 0 ;
         // setlocale(LC_ALL, "zh_CN.gb2312");
         for (i = 1 ; i < argc; i ++ )        {
                ret = GetHost(argv, & WebHost, & PageAddress, & WebPort, & WebDir); /* Get web page info */
                 if (ret)        { if (DEBUG) fprintf(stdout, " GetHost error from '%s' " , argv); exit( 0 );}
                AddInitNode(WebHost, PageAddress, WebPort, WebDir); /* add this page to chain */
        }
        free(WebHost); free(PageAddress);free(WebDir);
         if (DEBUG)        {
                fprintf(stdout, " Display.%5d: " , FileNumber);
                DisplayNode(NodeHeader); /* display every node */
        }
        HandleInitNode(NodeHeader); /* handle every page */
         return 0 ;
}

/* *************************************************************
功能：分析网页
************************************************************** */
void AnalyzePage(WEBNODE * node)
{
         int fd;
         int flength = 0 ;
        fd = open(node -> file, O_RDONLY);
         if (fd == - 1 )         goto __AnalyzeDone;
        flength = lseek(fd, 1 , SEEK_END);
        write(fd, "/ 0 " , 1 );
        lseek(fd, 0 , SEEK_SET);
        mapped_mem = mmap( 0 , flength, PROT_READ, MAP_PRIVATE, fd, 0 );
        GetEmail(mapped_mem);
        GetLink(mapped_mem);
        close(fd);
        munmap(mapped_mem, flength);
__AnalyzeDone:
        close(fd);
        node -> IsHandled = 1 ;
        remove(node -> file);
}

/* *************************************************************
功能：为根节点设置兄弟节点
************************************************************** */
void AddInitNode( char * Host, char * Page, int Port, char * Dir)
{
        WEBNODE * NewNode;
         char filename[MAXFILENAME + 1 ] = "" ;

         if (NodeHeader == NULL) NewNode = NodeHeader = (WEBNODE * )malloc( sizeof (WEBNODE));
         else NodeTail -> brother = NewNode = (WEBNODE * )malloc( sizeof (WEBNODE));
        memset(NewNode, 0 , sizeof (WEBNODE));
        NewNode -> host = ( char * )malloc(strlen(Host) + 1 );
        memset(NewNode -> host, 0 , strlen(Host) + 1 );
        NewNode -> page = ( char * )malloc(strlen(Page) + 1 );
        memset(NewNode -> page, 0 , strlen(Page) + 1 );
        NewNode -> dir = ( char * )malloc(strlen(Dir) + 1 );
        memset(NewNode -> dir, 0 , strlen(Dir) + 1 );
        NewNode -> file = ( char * )malloc(MAXFILENAME + 1 );
        memset(NewNode -> file, 0 , MAXFILENAME + 1 );
        strcpy(NewNode -> host, Host);
        strcpy(NewNode -> page, Page);
        strcpy(NewNode -> dir, Dir);
        sprintf(filename, " file%05d.html " , FileNumber ++ );
        strcpy(NewNode -> file, filename);
        NewNode -> port = Port;
        NewNode -> IsHandled = 0 ;
        NewNode -> brother = 0 ;
        NewNode -> child = 0 ;
        NodeTail = NewNode;
}

/* *************************************************************
功能：处理根节点信息
************************************************************** */
void HandleInitNode(WEBNODE * node)
{
        WEBNODE * CurrentNode = 0 ;
        CurrentNode = node;
         if (CurrentNode)        {
                 while (CurrentNode)        {
                         if (CurrentNode -> IsHandled == 0 )        {
                                HandOneNode(CurrentNode);
                                 if (DEBUG)        {
                                        fprintf(stdout, " Display.%5d: " , FileNumber);
                                        DisplayNode(NodeHeader); /* display every node */
                                }
                        }
                        CurrentNode = CurrentNode -> brother;
                }
                CurrentNode = node;
                 while (CurrentNode)        {
                         if (CurrentNode -> child && CurrentNode -> child -> IsHandled == 0 )        {
                                HandleInitNode(CurrentNode -> child);
                        }
                        CurrentNode = CurrentNode -> brother;
                }
        }
}

/* *************************************************************
功能：显示年有节点信息
************************************************************** */
void DisplayNode(WEBNODE * NodeHeader)
{
        WEBNODE * TempNode;
        TempNode = NodeHeader;
        fprintf(stdout, " " );
         while (TempNode) {
                 if ( ! strcmp(TempNode -> dir, " / " ))        fprintf(stdout, " %s:%d%s%s => %s %d " , TempNode -> host, TempNode -> port, TempNode -> dir, strcmp(TempNode -> page, " @ " ) ? TempNode -> page: "" , TempNode -> file, TempNode -> IsHandled);
                 else         fprintf(stdout, " %s:%d/%s/%s => %s %d " , TempNode -> host, TempNode -> port, TempNode -> dir, strcmp(TempNode -> page, " @ " ) ? TempNode -> page: "" , TempNode -> file, TempNode -> IsHandled);
                TempNode = TempNode -> brother;
        }
        TempNode = NodeHeader;
         while (TempNode) {
                 if (TempNode -> child)        DisplayNode(TempNode -> child);
                TempNode = TempNode -> brother;
        }
}

/* *************************************************************
功能：处理单个节点信息
************************************************************** */
void HandOneNode(WEBNODE * node)
{
         char UserAgent[ 1024 ] = "" , Accept[ 1024 ] = "" , AcceptLanguage[ 1024 ] = "" , AcceptEncoding[ 1024 ] = "" , AcceptCharset[ 1024 ] = "" , KeepAlive[ 1024 ] = "" , Connection[ 1024 ] = "" , ContentType[ 1024 ] = "" ;

        NodeCurr = node;
         if ((host = gethostbyname(NodeCurr -> host)) == NULL) /* get ip address by domain */
        {
                 if (DEBUG)  fprintf(stderr, " Gethostname '%s' error, %s " , NodeCurr -> host, strerror(errno));
                exit( 1 );
        }
        GetLocalAgent(UserAgent, Accept, AcceptLanguage, AcceptEncoding, AcceptCharset, KeepAlive, Connection, ContentType); /* Get client browser information */

         if (strcmp(NodeCurr -> dir, " / " ))        sprintf(request, " GET /%s/%s HTTP/1.0 Host: %s User-Agent: %s Accept: %s Connection: %s " , NodeCurr -> dir, strcmp(NodeCurr -> page, " @ " ) ? NodeCurr -> page: "" , NodeCurr -> host, UserAgent, Accept, Connection);
         else         sprintf(request, " GET %s%s HTTP/1.0 Host: %s User-Agent: %s Accept: %s Connection: %s " , NodeCurr -> dir, strcmp(NodeCurr -> page, " @ " ) ? NodeCurr -> page: "" , NodeCurr -> host, UserAgent, Accept, Connection);
        DoneWithList( 1 );
        AnalyzePage(NodeCurr);
}

/* *************************************************************
功能：从字符串 src 中分析出邮件地址保存到文件
************************************************************** */
void GetEmail( char * src)
{
         char * pa, * pb, * pc, * pd;
         char myemail[ 1024 ] = "" ;
        FILE * mailfp = NULL;
         if ((mailfp = fopen( " email.txt " , " a+ " )) == NULL)         return ;
        pa = src;
         while ((pb = strchr(pa, ' @ ' )))        {
                GetBeforePos(pb, & pc);
                GetAfterPos(pb, & pd);
                 if (pc && pd && (strlen(pc) > (strlen(pd) + 3 )))        {
                        memset(myemail, 0 , 1024 );
                        memcpy(myemail, pc, strlen(pc) - strlen(pd));
                         if (strcmp(NodeCurr -> dir, " / " )) fprintf(mailfp, " %s http://%s/%s/%s " , myemail, NodeCurr -> host, NodeCurr -> dir, strcmp(NodeCurr -> page, " @ " ) ? NodeCurr -> page: "" );
                         else   fprintf(mailfp, " %s http://%s%s%s " , myemail, NodeCurr -> host, NodeCurr -> dir, strcmp(NodeCurr -> page, " @ " ) ? NodeCurr -> page: "" );
                         if ( * (pd + 1 ))        pa = pd + 1 ;
                         else break ;
                }
                 else if ( * (pb + 1 ))        pa = pb + 1 ;
                 else          break ;
        }
        fclose(mailfp);
}

/* *************************************************************
功能：从 src 中找出前面的字母、数字等内含，即 email 地址中 @ 的前面部分
************************************************************** */
void GetBeforePos( char * src, char ** d)
{
         char * x;
         if (src - 1 )        x = src - 1 ;
         else { * d = 0 ; return ;}
         while (x)        {
                 if ( * x >= ' a ' && * x <= ' z ' ) {x -- ; continue ;}
                 else if ( * x >= ' A ' && * x <= ' Z ' ) {x -- ; continue ;}
                 else if ( * x >= ' 0 ' && * x <= ' 9 ' ) {x -- ; continue ;}
                 else if ( * x == ' . ' || * x == ' - ' || * x == ' _ ' ) {x -- ; continue ;}
                 else { break ;}
        }
        x ++ ;
         if (x) * d = x;
         else * d = 0 ;
}

/* *************************************************************
功能：从 src 中找出后面的字母、数字等内含，即 email 地址中 @ 的后面部分
************************************************************** */
void GetAfterPos( char * src, char ** d)
{
         char * x;
         if (src + 1 )        x = src + 1 ;
         else { * d = 0 ; return ;}
         while (x)        {
                 if ( * x >= ' a ' && * x <= ' z ' ) {x ++ ; continue ;}
                 else if ( * x >= ' A ' && * x <= ' Z ' ) {x ++ ; continue ;}
                 else if ( * x >= ' 0 ' && * x <= ' 9 ' ) {x ++ ; continue ;}
                 else if ( * x == ' . ' || * x == ' - ' || * x == ' _ ' ) {x ++ ; continue ;}
                 else { break ;}
        }
         if (x) * d = x;
         else * d = 0 ;
}

/* *************************************************************
功能：从 src 中找出前面的字母、数字等内含，即一个网页地址中主机名后面的部分
************************************************************** */
void GetAfterPosWithSlash( char * src, char ** d)
{
         char * x;
         if (src)        x = src;
         else { * d = 0 ; return ;}
         while (x)        {
                 if ( * x >= ' a ' && * x <= ' z ' ) {x ++ ; continue ;}
                 else if ( * x >= ' A ' && * x <= ' Z ' ) {x ++ ; continue ;}
                 else if ( * x >= ' 0 ' && * x <= ' 9 ' ) {x ++ ; continue ;}
                 else if ( * x == ' . ' || * x == ' - ' || * x == ' _ ' || * x == ' = ' ) {x ++ ; continue ;}
                 else if ( * x == ' : ' || * x == ' / ' || * x == ' ? ' || * x == ' & ' ) {x ++ ; continue ;}
                 else { break ;}
        }
         if (x) * d = x;
         else * d = 0 ;
}

/* *************************************************************
功能：为 myanchor 分配 len 大小的内存
************************************************************** */
void GetMemory( char ** myanchor, int len)
{
         if ( ! ( * myanchor))        ( * myanchor) = ( char * )malloc(len + 1 );
         else         ( * myanchor) = ( char * )realloc(( void * )( * myanchor), len + 1 );
        memset(( * myanchor), 0 , len + 1 );
}

/* *************************************************************
功能：从 src 中分析出网页链接，并加入到当前节点的子节点上
************************************************************** */
void GetLink( char * src)
{
         char * pa, * pb, * pc;
         char * myanchor = 0 ;
         int len = 0 ;

        pa = src;
         do {
                 if ((pb = strstr(pa, " href=' " )))        {
                        pc = strchr(pb + 6 , ' ' ' );
                        len = strlen(pb + 6 ) - strlen(pc);
                        GetMemory( & myanchor, len);
                        memcpy(myanchor, pb + 6 , len);
                }
                 else if ((pb = strstr(pa, " href=" " )))        {
                        pc = strchr(pb + 6 , ' " ' );
                        len = strlen(pb + 6 ) - strlen(pc);
                        GetMemory( & myanchor, len);
                        memcpy(myanchor, pb + 6 , len);
                }
                 else if ((pb = strstr(pa, " href= " )))        {
                        GetAfterPosWithSlash(pb + 5 , & pc);
                        len = strlen(pb + 5 ) - strlen(pc);
                        GetMemory( & myanchor, len);
                        memcpy(myanchor, pb + 5 , len);
                }
                 else { goto __returnLink ;}
/*
                if(DEBUG)        {
                        if(strcmp(NodeCurr->dir, "/"))        fprintf(stdout, "%s http://%s/%s/%s ", myanchor, NodeCurr->host, NodeCurr->dir, strcmp(NodeCurr->page, "`")?NodeCurr->page:"");
                        else        fprintf(stdout, "%s http://%s%s%s ", myanchor, NodeCurr->host, NodeCurr->dir, strcmp(NodeCurr->page, "`")?NodeCurr->page:"");
                }
*/
                 if (strlen(myanchor) > 0 )        AddChildNode(NodeCurr, myanchor);
                 if (pc + 1 )        pa = pc + 1 ;
        } while (pa);
__returnLink:
         return ;
}

/* *************************************************************
功能：为当前节点增加子节点
************************************************************** */
void AddChildNode(WEBNODE * node, char * src)
{
         int WebPort, len;
         char * WebHost = 0 , * PageAddress = 0 , * WebDir = 0 , * pC = 0 ;
        WEBNODE * NewNode;
         char filename[MAXFILENAME + 1 ] = "" ;
         char IsFromRoot = 0 ;

         if ( ! src)         return ;
         if ( ! strncasecmp(src, " mailto: " , strlen( " mailto: " )))         return ;
         if (strstr(src, " .css " ))         return ;
         if (strstr(src, " .xml " ))         return ;
         if (strstr(src, " .ico " ))         return ;
         if (strstr(src, " .jpg " ))         return ;
         if (strstr(src, " .gif " ))         return ;
         if (strstr(src, " javascript: " ))         return ;
         if (strstr(src, " + " ))         return ;

        ret = GetHost(src, & WebHost, & PageAddress, & WebPort, & WebDir);
         if (ret)        {
                len = strlen(node -> host);
                GetMemory( & WebHost, len);
                strcpy(WebHost, node -> host);

                WebPort = node -> port;

                IsFromRoot = ! strncmp(src, " / " , 1 );
                 if (IsFromRoot && (src + 1 ))        Rstrchr(src + 1 , ' / ' , & pC);
                 else if ( ! IsFromRoot)        Rstrchr(src, ' / ' , & pC);
                 else         pC = 0 ;

                 if (pC)        {
                         if (IsFromRoot)        len = strlen(src + 1 ) - strlen(pC);
                         else         len = strlen(src) - strlen(pC) + strlen(node -> dir) + 1 ;
                        GetMemory( & WebDir, len);
                         if (IsFromRoot)        memcpy(WebDir, src + 1 , len);
                         else         {memcpy(WebDir, node -> dir, strlen(node -> dir)); strcat(WebDir, " / " ); memcpy(WebDir + strlen(node -> dir) + 1 , src, strlen(src) - strlen(pC));}

                         if (pC + 1 )        {
                                len = strlen(pC + 1 );
                                GetMemory( & PageAddress, len);
                                strcpy(PageAddress, pC + 1 );
                        }
                         else         {
                                len = 1 ;
                                GetMemory( & PageAddress, len);
                                memcpy(PageAddress, e, len);
                        }
                }
                 else         {
                         if (IsFromRoot)        {
                                len = 1 ;
                                GetMemory( & WebDir, len);
                                memcpy(WebDir, e + 1 , len);

                                len = strlen(src + 1 );
                                GetMemory( & PageAddress, len);
                                memcpy(PageAddress, src + 1 , len);
                        }
                         else         {
                                len = strlen(node -> dir);
                                GetMemory( & WebDir, len);
                                memcpy(WebDir, node -> dir, len);

                                len = strlen(src);
                                GetMemory( & PageAddress, len);
                                memcpy(PageAddress, src, len);
                        }
                }
        }
        ret = IsExistWeb(NodeHeader, WebHost, PageAddress, WebPort, WebDir);
         if (ret) goto __ReturnAdd;

         if (node -> child == NULL)        NewNode = node -> child = (WEBNODE * )malloc( sizeof (WEBNODE));
         else NodeTail -> brother = NewNode = (WEBNODE * )malloc( sizeof (WEBNODE));
        memset(NewNode, 0 , sizeof (WEBNODE));
        NewNode -> host = ( char * )malloc(strlen(WebHost) + 1 );
        memset(NewNode -> host, 0 , strlen(WebHost) + 1 );
        NewNode -> page = ( char * )malloc(strlen(PageAddress) + 1 );
        memset(NewNode -> page, 0 , strlen(PageAddress) + 1 );
        NewNode -> dir = ( char * )malloc(strlen(WebDir) + 1 );
        memset(NewNode -> dir, 0 , strlen(WebDir) + 1 );
        NewNode -> file = ( char * )malloc(MAXFILENAME + 1 );
        memset(NewNode -> file, 0 , MAXFILENAME + 1 );
        strcpy(NewNode -> host, WebHost);
        strcpy(NewNode -> page, PageAddress);
        strcpy(NewNode -> dir, WebDir);
        sprintf(filename, " file%05d.html " , FileNumber ++ );
        strcpy(NewNode -> file, filename);
        NewNode -> port = WebPort;
        NewNode -> IsHandled = 0 ;
        NewNode -> brother = 0 ;
        NewNode -> child = 0 ;
        NodeTail = NewNode;
__ReturnAdd:
        free(WebHost); free(PageAddress); free(WebDir);
}

/* *************************************************************
功能：检查是否已经处理过的网页
************************************************************** */
int IsExistWeb(WEBNODE * node, char * host, char * page, int port, char * dir)
{
        WEBNODE * t;
        t = node;
         while (t)        {
                 if ( ! strcmp(t -> host, host) && ! strcmp(t -> page, page) && t -> port == port && ! strcmp(t -> dir, dir)) return 1 ;
                t = t -> brother;
        }
        t = node;
         while (t)        {
                 if (t -> child)        {
                        ret = IsExistWeb(t -> child, host, page, port, dir);
                         if (ret)         return 2 ;
                }
                t = t -> brother;
        }
         return 0 ;
}

编译这个程序：

gcc mailaddrsearch.c -o mailsearcher

输入一个网址作为参数运行一下试试吧：

./mailsearcher http://zhoulifa.bokee.com/ 5531748 .html

程序首先找出 http://zhoulifa.bokee.com/5531748.html 页面上的邮件地址保存到当前目录下 email.txt 文件里，每行一条记录，格式为邮件地址和出现该邮件地址的网页。然后分析这个页面上出现的网页链接，把各链接作为子节点加入链表，再去处理子节点，重复上述操作。

这只是一个示例程序，并不完善，如果要使其达到实用的目的，还需要让这个程序效率更高点，比如加入 epoll （在 2.4 内核中只有 select 了）实现 I/O 多路复用。又比如对每个子节点实现多线程，每个线程处理一个节点。
如果对 I/O 多路复用不熟悉，您可以看一下我这篇文章 http://zhoulifa.bokee.com/5345930.html 里关于 “ Linux 下各类TCP网络服务器的实现源代码”