Estoy haciendo un scrapper/crawler para phpbb, me funciona de lujo en foros sin autentificación, pero cuando lo tengo que usar en un foro que necesita estar autentificado viene el problema.
Intento hacer el login, va bien y me devuelve las cookies las cuales las guardo en la clase, pero luego al ir a otra dirección del sitio, el foro no me ve autentificado y en la petición he enviado las cookies.
Les muestro la clase:
phpbb_scrapper.class.php
Código PHP :
<?php set_time_limit(0); class phpbb_scrapper { // Forum URL where we will scrap the info var $forumURL; // User is logged in? var $isLogged; // List of topic links found var $topicLinkList; // List of the links found var $linkList; // SID var $sid; // Constructor function phpbb_scrapper($forumURL) { // erase http:// if it have and the end slash $f = str_replace('http://','',$forumURL); $f = explode('/',$f); $forumURL = $f[0]; // save the url $this->forumURL = $forumURL; } // Login to phpbb function doLogin($user, $password, $loginURL) { $fp = fsockopen($this->forumURL, 80, $errno, $errstr) or die('Cannot connect to forum: '.$this->forumURL); // Make a sid for phpbb $sid = 0; $dummy = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"; for($i=0;$i<31;$i++) $sid .= substr($dummy, (rand()%(strlen($dummy))), 1); $sid = md5($sid); // we save the sid for the future $this->sid = $sid; // The login URL were we will do the login $l = explode($this->forumURL,$loginURL); $l = explode('?',$l[1]); $loginURL = $l[0]; if (!$loginURL) return false; $postdata = "username=$user&password=$password&sid=$sid&login=Login"; // We build the packet to authenticate $packet = "POST $loginURL HTTP/1.1\r\n"; $packet .= "Host: ".$this->forumURL."\r\n"; $packet .= "User-Agent: Mozilla/5.0 (Windows; U; Windows NT 6.0; es-CL; rv:1.9.1.8) Gecko/20100202 Firefox/3.5.8 (.NET CLR 3.5.30729)\r\n"; $packet .= "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8\r\n"; $packet .= "Accept-Language: es-cl,es;q=0.8,en-us;q=0.5,en;q=0.3\r\n"; // $packet .= "Accept-Encoding: gzip,deflate\r\n"; $packet .= "Accept-Charset: ISO-8859-1,utf-8;q=0.7,*;q=0.7\r\n"; //$packet .= "Keep-Alive: 300\r\n"; $packet .= "Connection: keep-alive\r\n"; $packet .="Cache-Control: no-cache\r\n"; $packet .= "Referer: http://".$this->forumURL."$loginURL\r\n"; // $packet .= "Cookie: __utma=121132809.1818766607.1266506538.1266506538.1266668142.2; __utmz=121132809.1266506538.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __utmb=121132809.7.10.1266668142; __utmc=121132809; phpbb3_89k5d_u=1; phpbb3_89k5d_k=; phpbb3_89k5d_sid=$sid\r\n"; $packet .= "Content-Type: application/x-www-form-urlencoded\r\n"; $packet .= "Content-Length: ".strlen($postdata)."\r\n\r\n"; $packet .= "$postdata\r\n\r\n"; print $packet; // Send packet fwrite($fp, $packet, strlen($packet)); // get data while(!feof($fp)) $res.= fread($fp, 1024); fclose($fp); print $res; // parsing the response to get cookies $s = explode("\n",$res); if ($s) { foreach($s as $pos => $v) { $r = array(); ereg("^Set-Cookie: ([^;]+);(.*)$",$v,$r); // we got cookie if ($r[1]) { $cookies[] = trim(str_replace("\r","",str_replace("\n","",$r[1]))); } } } else return false; if ($cookies) { $this->saveCookies($cookies); return true; } else return false; } // search for keywords function search($keyword) { // Replace spaces with + $keyword = str_replace(' ','+',$keyword); $refer = "http://".$this->forumURL."/search.php"; $path = "/search.php?mode=results"; $get = "/search.php?keywords=$keyword&terms=all&author=&sc=1&sf=titleonly&sk=t&sd=d&sr=topics&st=0&ch=0&t=0&submit=Search"; $postdata .= "keywords=$keyword&terms=all&author=&sc=1&sf=titleonly&sk=t&sd=d&sr=topics&st=0&ch=0&t=0&submit=Search\r\n\r\n"; $nextPage=true; $startSearch=true; while($nextPage===true) { $fp = fsockopen($this->forumURL, 80, $errno, $errstr) or die('Cannot connect to forum: '.$this->forumURL); if ($startSearch) $packet = "POST $path HTTP/1.1\r\n"; else $packet = "GET $get HTTP/1.1\r\n"; $packet .= "Host: ".$this->forumURL."\r\n"; $packet .= "User-Agent: Mozilla/5.0 (Windows; U; Windows NT 6.0; es-CL; rv:1.9.1.8) Gecko/20100202 Firefox/3.5.8 (.NET CLR 3.5.30729)\r\n"; $packet .= "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8\r\n"; $packet .= "Accept-Language: es-cl,es;q=0.8,en-us;q=0.5,en;q=0.3\r\n"; //$packet .= "Accept-Encoding: gzip,deflate\r\n"; $packet .= "Accept-Charset: ISO-8859-1,utf-8;q=0.7,*;q=0.7\r\n"; //$packet .= "Keep-Alive: 300\r\n"; $packet .= "Connection: keep-alive\r\n"; $packet .="Cache-Control: no-cache\r\n"; $packet .= "Referer: $refer\r\n"; $packet .= "Cookie: ".$this->cookies."\r\n"; if ($startSearch) { $packet .= "Content-Length: ".strlen($postdata)."\r\n\r\n"; $packet .= "$postdata"; } else $packet .= "\r\n"; $startSearch = false; print $packet; fwrite($fp, $packet, strlen($packet)); $res = ""; while(!feof($fp)) $res.=fread($fp,2048); print $res; fclose($fp); // we search for topic with its links $s = explode("\n",$res); foreach ($s as $v) { $t = array(); ereg('<a href="([viewtopic.php]+[^"]+)"([^>]+)>([^<]+)</a>',$v,$t); if ($t[3] && $t[1]) $links[$t[3]] = str_replace('&','&',$t[1]); } $n = array(); ereg('<a href="([^"]+)">Next',$res,$n); if ($n[1]) { $nextPage = true; $refer = $get; $get = str_replace('&','&',$n[1]); } else $nextPage = false; } if (!$links) return false; else { $this->topicLinkList = $links; return true; } } // get the links in all results function getLinksFromSearch($url) { if (!$this->topicLinkList) return false; $url = str_replace('http://','',$url); $total_links = count($this->topicLinkList); print "[$total_links] topics found\n"; $pos = 0; foreach ($this->topicLinkList as $topic => $link) { $pos++; $refer = "http://".$this->forumURL."/search.php"; $get = "$link"; $nextPage=true; $k=1; while($nextPage===true) { print "[$pos/$total_links] [$k] page from [$topic]\n"; $fp = fsockopen($this->forumURL, 80, $errno, $errstr) or die('Cannot connect to forum: '.$this->forumURL); $packet = "GET $get HTTP/1.1\r\n"; $packet .= "Host: ".$this->forumURL."\r\n"; $packet .= "User-Agent: Mozilla/5.0 (Windows; U; Windows NT 6.0; es-CL; rv:1.9.1.8) Gecko/20100202 Firefox/3.5.8 (.NET CLR 3.5.30729)\r\n"; $packet .= "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8\r\n"; $packet .= "Accept-Language: es-cl,es;q=0.8,en-us;q=0.5,en;q=0.3\r\n"; //$packet .= "Accept-Encoding: gzip,deflate\r\n"; $packet .= "Accept-Charset: ISO-8859-1,utf-8;q=0.7,*;q=0.7\r\n"; //$packet .= "Keep-Alive: 300\r\n"; $packet .= "Connection: keep-alive\r\n"; $packet .="Cache-Control: no-cache\r\n"; $packet .= "Referer: $refer\r\n"; $packet .= "Cookie: ".$this->cookies."\r\n\r\n"; fwrite($fp, $packet, strlen($packet)); $res = ""; while(!feof($fp)) $res.=fread($fp,2048); fclose($fp); // we search for topic with its links $s = explode("\n",$res); foreach ($s as $v) { $t = array(); if(!ereg('<a href="http://'.$url.'([^"]+)"([^>]+)>([^<]+)</a>',$v,$t)) if(!ereg('<a href="'.$url.'([^"]+)"([^>]+)>([^<]+)</a>',$v,$t)) if(!ereg('>http://'.$url.'([^"]+)"([^<]+)<',$v,$t)) ereg('>'.$url.'([^"]+)"([^<]+)<',$v,$t); if ($t[3] && $t[1]) { print "found link: ".$t[1]."\n"; $links[$t[3]][] = $t[1]; } } $n = array(); ereg('<a href="([^"]+)">Next',$res,$n); if ($n[1]) { $nextPage = true; $refer = $get; $get = str_replace('&','&',$n[1]); $k++; } else $nextPage = false; } } if (!$links) return false; else { $this->linkList = $links; print_r($this->linkList); return true; } } // Export results to CSV function getCsv() { } //------------------------- // // Internal use // //------------------------- function saveCookies($cookies) { foreach($cookies as $data) { if ($cookie) $cookie.= "; $data"; else $cookie= $data; } $this->cookies = $cookie."; __utma=128554762.1179315317.1266717935.1266717935.1266717935.1; __utmb=128554762.4.10.1266717935; __utmc=128554762; __utmz=128554762.1266717935.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none)"; } } ?>
Y lo ejecuto desde acá:
test.php
Código PHP :
<?php require_once("phpbb_scrapper.class.php"); $scrap = new phpbb_scrapper('http://www.warez-bb.org'); print "[+] Loggin in...\n"; if ($scrap->doLogin('USUARIO','PASSWORD','http://www.warez-bb.org/login.php?sid=a667eecb5e2f6180f36fbd8ac68ecd42')) print "[+] Logged in\n"; else { print "[X] Login failed\n"; die; } print "[+] Searching...\n"; if ($scrap->search('raduga')) print "[+] Search completed\n"; else { print "[X] No result.\n"; die; } print "[+] Getting links...\n"; if ($scrap->getLinksFromSearch('http://gigapeta.com')) print "[+] Got links!\n"; else print "[X] No links found.\n"; ?>
Al parecer las cookies no las estoy enviando como debe ser, o algún problema en el paquete que envío, pero al comenzar con la búsqueda ya estoy deautentificado. Al hacer el login no tengo problema, el servidor responde que me he autentificado correctamente.
Necesito hacerlo con sockets, no con cURL.
Muchas gracias por su ayuda.
Saludos.