Estoy haciendo un scrapper/crawler para phpbb, me funciona de lujo en foros sin autentificación, pero cuando lo tengo que usar en un foro que necesita estar autentificado viene el problema.
Intento hacer el login, va bien y me devuelve las cookies las cuales las guardo en la clase, pero luego al ir a otra dirección del sitio, el foro no me ve autentificado y en la petición he enviado las cookies.
Les muestro la clase:
phpbb_scrapper.class.php
Código PHP :
<?php
set_time_limit(0);
class phpbb_scrapper
{
// Forum URL where we will scrap the info
var $forumURL;
// User is logged in?
var $isLogged;
// List of topic links found
var $topicLinkList;
// List of the links found
var $linkList;
// SID
var $sid;
// Constructor
function phpbb_scrapper($forumURL)
{
// erase http:// if it have and the end slash
$f = str_replace('http://','',$forumURL);
$f = explode('/',$f);
$forumURL = $f[0];
// save the url
$this->forumURL = $forumURL;
}
// Login to phpbb
function doLogin($user, $password, $loginURL)
{
$fp = fsockopen($this->forumURL, 80, $errno, $errstr) or die('Cannot connect to forum: '.$this->forumURL);
// Make a sid for phpbb
$sid = 0;
$dummy = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789";
for($i=0;$i<31;$i++)
$sid .= substr($dummy, (rand()%(strlen($dummy))), 1);
$sid = md5($sid);
// we save the sid for the future
$this->sid = $sid;
// The login URL were we will do the login
$l = explode($this->forumURL,$loginURL);
$l = explode('?',$l[1]);
$loginURL = $l[0];
if (!$loginURL)
return false;
$postdata = "username=$user&password=$password&sid=$sid&login=Login";
// We build the packet to authenticate
$packet = "POST $loginURL HTTP/1.1\r\n";
$packet .= "Host: ".$this->forumURL."\r\n";
$packet .= "User-Agent: Mozilla/5.0 (Windows; U; Windows NT 6.0; es-CL; rv:1.9.1.8) Gecko/20100202 Firefox/3.5.8 (.NET CLR 3.5.30729)\r\n";
$packet .= "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8\r\n";
$packet .= "Accept-Language: es-cl,es;q=0.8,en-us;q=0.5,en;q=0.3\r\n";
// $packet .= "Accept-Encoding: gzip,deflate\r\n";
$packet .= "Accept-Charset: ISO-8859-1,utf-8;q=0.7,*;q=0.7\r\n";
//$packet .= "Keep-Alive: 300\r\n";
$packet .= "Connection: keep-alive\r\n";
$packet .="Cache-Control: no-cache\r\n";
$packet .= "Referer: http://".$this->forumURL."$loginURL\r\n";
// $packet .= "Cookie: __utma=121132809.1818766607.1266506538.1266506538.1266668142.2; __utmz=121132809.1266506538.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __utmb=121132809.7.10.1266668142; __utmc=121132809; phpbb3_89k5d_u=1; phpbb3_89k5d_k=; phpbb3_89k5d_sid=$sid\r\n";
$packet .= "Content-Type: application/x-www-form-urlencoded\r\n";
$packet .= "Content-Length: ".strlen($postdata)."\r\n\r\n";
$packet .= "$postdata\r\n\r\n";
print $packet;
// Send packet
fwrite($fp, $packet, strlen($packet));
// get data
while(!feof($fp))
$res.= fread($fp, 1024);
fclose($fp);
print $res;
// parsing the response to get cookies
$s = explode("\n",$res);
if ($s)
{
foreach($s as $pos => $v)
{
$r = array();
ereg("^Set-Cookie: ([^;]+);(.*)$",$v,$r);
// we got cookie
if ($r[1])
{
$cookies[] = trim(str_replace("\r","",str_replace("\n","",$r[1])));
}
}
}
else
return false;
if ($cookies)
{
$this->saveCookies($cookies);
return true;
}
else
return false;
}
// search for keywords
function search($keyword)
{
// Replace spaces with +
$keyword = str_replace(' ','+',$keyword);
$refer = "http://".$this->forumURL."/search.php";
$path = "/search.php?mode=results";
$get = "/search.php?keywords=$keyword&terms=all&author=&sc=1&sf=titleonly&sk=t&sd=d&sr=topics&st=0&ch=0&t=0&submit=Search";
$postdata .= "keywords=$keyword&terms=all&author=&sc=1&sf=titleonly&sk=t&sd=d&sr=topics&st=0&ch=0&t=0&submit=Search\r\n\r\n";
$nextPage=true;
$startSearch=true;
while($nextPage===true)
{
$fp = fsockopen($this->forumURL, 80, $errno, $errstr) or die('Cannot connect to forum: '.$this->forumURL);
if ($startSearch)
$packet = "POST $path HTTP/1.1\r\n";
else
$packet = "GET $get HTTP/1.1\r\n";
$packet .= "Host: ".$this->forumURL."\r\n";
$packet .= "User-Agent: Mozilla/5.0 (Windows; U; Windows NT 6.0; es-CL; rv:1.9.1.8) Gecko/20100202 Firefox/3.5.8 (.NET CLR 3.5.30729)\r\n";
$packet .= "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8\r\n";
$packet .= "Accept-Language: es-cl,es;q=0.8,en-us;q=0.5,en;q=0.3\r\n";
//$packet .= "Accept-Encoding: gzip,deflate\r\n";
$packet .= "Accept-Charset: ISO-8859-1,utf-8;q=0.7,*;q=0.7\r\n";
//$packet .= "Keep-Alive: 300\r\n";
$packet .= "Connection: keep-alive\r\n";
$packet .="Cache-Control: no-cache\r\n";
$packet .= "Referer: $refer\r\n";
$packet .= "Cookie: ".$this->cookies."\r\n";
if ($startSearch)
{
$packet .= "Content-Length: ".strlen($postdata)."\r\n\r\n";
$packet .= "$postdata";
}
else
$packet .= "\r\n";
$startSearch = false;
print $packet;
fwrite($fp, $packet, strlen($packet));
$res = "";
while(!feof($fp))
$res.=fread($fp,2048);
print $res;
fclose($fp);
// we search for topic with its links
$s = explode("\n",$res);
foreach ($s as $v)
{
$t = array();
ereg('<a href="([viewtopic.php]+[^"]+)"([^>]+)>([^<]+)</a>',$v,$t);
if ($t[3] && $t[1])
$links[$t[3]] = str_replace('&','&',$t[1]);
}
$n = array();
ereg('<a href="([^"]+)">Next',$res,$n);
if ($n[1])
{
$nextPage = true;
$refer = $get;
$get = str_replace('&','&',$n[1]);
}
else
$nextPage = false;
}
if (!$links)
return false;
else
{
$this->topicLinkList = $links;
return true;
}
}
// get the links in all results
function getLinksFromSearch($url)
{
if (!$this->topicLinkList)
return false;
$url = str_replace('http://','',$url);
$total_links = count($this->topicLinkList);
print "[$total_links] topics found\n";
$pos = 0;
foreach ($this->topicLinkList as $topic => $link)
{
$pos++;
$refer = "http://".$this->forumURL."/search.php";
$get = "$link";
$nextPage=true;
$k=1;
while($nextPage===true)
{
print "[$pos/$total_links] [$k] page from [$topic]\n";
$fp = fsockopen($this->forumURL, 80, $errno, $errstr) or die('Cannot connect to forum: '.$this->forumURL);
$packet = "GET $get HTTP/1.1\r\n";
$packet .= "Host: ".$this->forumURL."\r\n";
$packet .= "User-Agent: Mozilla/5.0 (Windows; U; Windows NT 6.0; es-CL; rv:1.9.1.8) Gecko/20100202 Firefox/3.5.8 (.NET CLR 3.5.30729)\r\n";
$packet .= "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8\r\n";
$packet .= "Accept-Language: es-cl,es;q=0.8,en-us;q=0.5,en;q=0.3\r\n";
//$packet .= "Accept-Encoding: gzip,deflate\r\n";
$packet .= "Accept-Charset: ISO-8859-1,utf-8;q=0.7,*;q=0.7\r\n";
//$packet .= "Keep-Alive: 300\r\n";
$packet .= "Connection: keep-alive\r\n";
$packet .="Cache-Control: no-cache\r\n";
$packet .= "Referer: $refer\r\n";
$packet .= "Cookie: ".$this->cookies."\r\n\r\n";
fwrite($fp, $packet, strlen($packet));
$res = "";
while(!feof($fp))
$res.=fread($fp,2048);
fclose($fp);
// we search for topic with its links
$s = explode("\n",$res);
foreach ($s as $v)
{
$t = array();
if(!ereg('<a href="http://'.$url.'([^"]+)"([^>]+)>([^<]+)</a>',$v,$t))
if(!ereg('<a href="'.$url.'([^"]+)"([^>]+)>([^<]+)</a>',$v,$t))
if(!ereg('>http://'.$url.'([^"]+)"([^<]+)<',$v,$t))
ereg('>'.$url.'([^"]+)"([^<]+)<',$v,$t);
if ($t[3] && $t[1])
{
print "found link: ".$t[1]."\n";
$links[$t[3]][] = $t[1];
}
}
$n = array();
ereg('<a href="([^"]+)">Next',$res,$n);
if ($n[1])
{
$nextPage = true;
$refer = $get;
$get = str_replace('&','&',$n[1]);
$k++;
}
else
$nextPage = false;
}
}
if (!$links)
return false;
else
{
$this->linkList = $links;
print_r($this->linkList);
return true;
}
}
// Export results to CSV
function getCsv()
{
}
//-------------------------
//
// Internal use
//
//-------------------------
function saveCookies($cookies)
{
foreach($cookies as $data)
{
if ($cookie)
$cookie.= "; $data";
else
$cookie= $data;
}
$this->cookies = $cookie."; __utma=128554762.1179315317.1266717935.1266717935.1266717935.1; __utmb=128554762.4.10.1266717935; __utmc=128554762; __utmz=128554762.1266717935.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none)";
}
}
?>
Y lo ejecuto desde acá:
test.php
Código PHP :
<?php
require_once("phpbb_scrapper.class.php");
$scrap = new phpbb_scrapper('http://www.warez-bb.org');
print "[+] Loggin in...\n";
if ($scrap->doLogin('USUARIO','PASSWORD','http://www.warez-bb.org/login.php?sid=a667eecb5e2f6180f36fbd8ac68ecd42'))
print "[+] Logged in\n";
else
{
print "[X] Login failed\n";
die;
}
print "[+] Searching...\n";
if ($scrap->search('raduga'))
print "[+] Search completed\n";
else
{
print "[X] No result.\n";
die;
}
print "[+] Getting links...\n";
if ($scrap->getLinksFromSearch('http://gigapeta.com'))
print "[+] Got links!\n";
else
print "[X] No links found.\n";
?>
Al parecer las cookies no las estoy enviando como debe ser, o algún problema en el paquete que envío, pero al comenzar con la búsqueda ya estoy deautentificado. Al hacer el login no tengo problema, el servidor responde que me he autentificado correctamente.
Necesito hacerlo con sockets, no con cURL.
Muchas gracias por su ayuda.
Saludos.
