paste.php.lv / koda glabātuve (vēl mums ir forumi un irc kanāls)

Autors: andrisp

Komentārs: http://apblog.lv

Skatīt bez rindiņu numerācijas

  1. <?php
  2. /*
  3. * autors: andrisp
  4. * http://apblog.lv
  5. * apblog at apblog punkts lv
  6. */
  7.  
  8. $address = ''; // Å eit ievadam bloga adresi ar visu http://
  9.  
  10. $file_for_structure = 'structure.sql';
  11. $file_for_articles = 'posts.sql';
  12. $file_for_comments = 'comments.sql';
  13. $debug_file = 'debug.txt';
  14.  
  15. $debug_mode = false;
  16. // Kad ieslēgts, tad tiek logoti tikai HTTP pieprasījumi (iekš $debug_file).
  17. // Apstrādes kļūdas utt netiek logotas
  18.  
  19. set_time_limit(0);
  20. header('Content-Type: text/plain; charset=UTF-8');
  21.  
  22. if (substr($address, -1, 1) != '/') {
  23. $address .= '/';
  24. }
  25.  
  26. $debug_file = fopen($debug_file, 'a');
  27. $file_for_articles = fopen($file_for_articles, 'w');
  28. $file_for_comments = fopen($file_for_comments, 'w');
  29. $file_for_structure = fopen($file_for_structure, 'w');
  30.  
  31. genereate_create_table_sql();
  32.  
  33. $html = request($address);
  34. $archive_links = get_archive_months($html);
  35.  
  36. $article_id = 1;
  37.  
  38. foreach($archive_links as $i) {
  39.  
  40. $archive_page = request($i);
  41.  
  42. $links_to_articles = parse_archive_page($archive_page);
  43.  
  44. foreach($links_to_articles as $ii) {
  45.  
  46. $article_page = request($ii);
  47.  
  48. $article_data = parse_article_page($article_page);
  49.  
  50. echo $article_id.' '.$article_data['article']['title']."\n";
  51.  
  52. fwrite(
  53. $file_for_articles,
  54. generate_article_insert_sql($article_id, $article_data)
  55. );
  56.  
  57. fwrite(
  58. $file_for_comments,
  59. generate_comment_insert_sql($article_id, $article_data['comments'])
  60. );
  61.  
  62. $article_id++;
  63.  
  64.  
  65. }
  66.  
  67. }
  68.  
  69.  
  70. function get_archive_months($html) {
  71.  
  72. if (empty($html)) {
  73. return false;
  74. }
  75.  
  76. $sp = '/<div(?:(?:[\s\n\r\t]+[^>]+[\s\n\r\t]+)?|(?:[\s\n\r\t]+)?)?id=["\']ptb_archives["\'][^>]*>(.*?)<\/div>/ismu';
  77. preg_match($sp, $html, $m);
  78.  
  79. if (!empty($m[1])) {
  80.  
  81. $sp2 = '/href="(.*?)"/ismu';
  82. preg_match_all($sp2, $m[1], $m);
  83.  
  84. sort($m[1]);
  85.  
  86. return $m[1];
  87.  
  88. }
  89.  
  90. return false;
  91.  
  92. }
  93.  
  94. function parse_archive_page($html) {
  95.  
  96. if (empty($html)) {
  97. return false;
  98. }
  99.  
  100. $sp = '/<h3>[^><]*?<a[^><]+?href="([^"]+?)"[^><]+?rel="bookmark"[^><]*?>(.+?)<\/a>[^><]*?<\/h3>/ismu';
  101. preg_match_all($sp, $html, $m);
  102.  
  103. $article_links = array_reverse($m[1]);
  104.  
  105. return $article_links;
  106.  
  107. }
  108.  
  109. function parse_article_page($html) {
  110.  
  111. if (empty($html)) {
  112. return false;
  113. }
  114.  
  115. $sp = '/<h3>[^><]*?<a[^><]*?>(.+?)<\/a>[^><]*?<\/h3>[^><]*?<div[^><]+?class="blogpost_body">(.+?)<\/div>[^><]*?<div[^><]+?class="blogpost_info"[^><]*?>.*?([0-9]{4}\.[0-9]{2}\.[0-9]{2}\.)\s+([0-9]{2}:[0-9]{2}).*?<\/div>/ismu';
  116. preg_match($sp, $html, $a);
  117.  
  118. $article_title = $a[1];
  119. $tags = get_tags($a[2]);
  120. $article_text = remove_tags_block($a[2]);
  121. $article_date = $a[3];
  122. $article_time = $a[4];
  123.  
  124. $sp = '/<div[^><]+?class="blogpostcomment(?:\sunregisteredUserComment){0,1}"[^><]*?>[^><]+?<div[^><]+?class="blogpostcomment_intro">[^><]*?<a[^><]*?>[^><]+?<\/a>[^><]*?<a[^><]+?href="(.+?)"[^><]*?>([^><]+?)<\/a>[^><]*?@[^><]*?([0-9]{4}\-[0-9]{2}\-[0-9]{2})[^><]*?([0-9]{2}:[0-9]{2}:[0-9]{2}).+?<\/div>[^><]*?<div[^><]*?>.+?<\/div>[^><]*?<p>(.+?)<\/p>[^><]*?<\/div>/ismu';
  125. preg_match_all($sp, $html, $c);
  126.  
  127. $comment_authors = $c[2];
  128. $comment_texts = $c[5];
  129. $comment_dates = $c[3];
  130. $comment_times = $c[4];
  131. $comment_urls = $c[1];
  132.  
  133. $comments = array();
  134.  
  135. foreach ($comment_texts as $k => $v) {
  136.  
  137. $comments[] = array(
  138. 'author' => $comment_authors[$k],
  139. 'text' => $v,
  140. 'date' => $comment_dates[$k],
  141. 'time' => $comment_times[$k],
  142. 'url' => $comment_urls[$k]
  143. );
  144.  
  145. }
  146.  
  147. return array(
  148. 'article' => array(
  149. 'title' => $article_title,
  150. 'text' => $article_text,
  151. 'date' => $article_date,
  152. 'time' => $article_time,
  153. 'tags' => $tags
  154. ),
  155. 'comments' => $comments
  156. );
  157.  
  158. }
  159.  
  160. function generate_article_insert_sql($id, $article_data) {
  161.  
  162. global $file_for_articles;
  163.  
  164. $dparts = explode('.', $article_data['article']['date']);
  165.  
  166. $datetime = $dparts[0].'-'.$dparts[1].'-'.$dparts[2].' '
  167. . $article_data['article']['time'].':01';
  168.  
  169. $tags = '';
  170. if (is_array($article_data['article']['tags'])
  171. && count($article_data['article']['tags']) > 0) {
  172.  
  173. $tags = implode(',', $article_data['article']['tags']);
  174.  
  175. }
  176.  
  177.  
  178. $sql = "
  179. INSERT INTO `articles` (
  180. `id`, `datetime`, `title`, `text`, `tags`
  181. ) VALUES (
  182. ".(int)$id.",
  183. '".mysql_escape_string($datetime)."',
  184. '".mysql_escape_string($article_data['article']['title'])."',
  185. '".mysql_escape_string($article_data['article']['text'])."',
  186. '".mysql_escape_string($tags)."'
  187. );
  188. ";
  189.  
  190. return $sql;
  191.  
  192. }
  193.  
  194. function generate_comment_insert_sql($id, $comments) {
  195.  
  196. global $file_for_comments;
  197.  
  198. $sql = '';
  199.  
  200. foreach($comments as $v) {
  201.  
  202. $datetime = $v['date'].' '.$v['time'];
  203.  
  204. $sql .= "
  205. INSERT INTO `comments` (
  206. `article_id`,
  207. `datetime`,
  208. `author`,
  209. `text`,
  210. `homepage`
  211. ) VALUES (
  212. ".(int)$id.",
  213. '".mysql_escape_string($datetime)."',
  214. '".mysql_escape_string($v['author'])."',
  215. '".mysql_escape_string($v['text'])."',
  216. '".mysql_escape_string($v['url'])."'
  217. );
  218. "."\n\n";
  219.  
  220. }
  221.  
  222. return $sql;
  223.  
  224. }
  225.  
  226. function get_tags($html) {
  227.  
  228. if (empty($html)) {
  229. return false;
  230. }
  231.  
  232. $sp = '/<a[^><]+?rel="tag"[^><]*?>(.+?)<\/a>/ismu';
  233. preg_match_all($sp, $html, $c);
  234.  
  235. if (is_array($c[1]) && count($c[1]) > 0) {
  236. return $c[1];
  237. }
  238.  
  239. return false;
  240.  
  241. }
  242.  
  243. function remove_tags_block($html) {
  244.  
  245. if (empty($html)) {
  246. return false;
  247. }
  248.  
  249. $sp = '/<p[^><]+?class="blogpost_tags"[^><]*?>(:?[^><]*?<a[^><]*?>(.+)?<\/a>[^><]*?)<\/p>/ismu';
  250. return preg_replace($sp, '', $html);
  251.  
  252. }
  253.  
  254. function genereate_create_table_sql() {
  255.  
  256. global $file_for_structure;
  257.  
  258. $sql = "
  259. CREATE TABLE `blogiem_articles` (
  260. `id` int(10) unsigned NOT NULL auto_increment,
  261. `title` varchar(255) default NULL,
  262. `text` text,
  263. `datetime` datetime default NULL,
  264. `tags` varchar(255) default NULL,
  265. PRIMARY KEY (`id`)
  266. ) ENGINE=MyISAM DEFAULT CHARSET=utf8;
  267.  
  268. CREATE TABLE `blogiem_comments` (
  269. `id` int(10) unsigned NOT NULL auto_increment,
  270. `article_id` int(11) default NULL,
  271. `author` varchar(100) default NULL,
  272. `text` text,
  273. `homepage` varbinary(255) default NULL,
  274. `datetime` datetime default NULL,
  275. PRIMARY KEY (`id`)
  276. ) ENGINE=MyISAM DEFAULT CHARSET=utf8;
  277. ";
  278.  
  279. return fwrite($file_for_structure, $sql);
  280.  
  281. }
  282.  
  283. function request($link) {
  284.  
  285. global $debug_file, $debug_mode;
  286.  
  287. if (empty($link)) {
  288. return false;
  289. }
  290.  
  291. $custom_headers = array(
  292. 'User-Agent: Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.4) Gecko/20070515 Firefox/2.0.0.4',
  293. 'Accept: text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
  294. 'Accept-Language: en-us,en;q=0.5',
  295. 'Accept-Encoding: gzip,deflate',
  296. 'Accept-Charset: UTF-8,*',
  297. 'Keep-Alive: 300',
  298. 'Connection: keep-alive',
  299. );
  300.  
  301. $session = curl_init();
  302.  
  303. if (isset($debug_mode) && $debug_mode == true) {
  304. curl_setopt($session, CURLOPT_VERBOSE, 1);
  305. }
  306.  
  307. curl_setopt($session, CURLOPT_STDERR, $debug_file);
  308. curl_setopt($session, CURLOPT_COOKIEJAR, "cookies.txt");
  309. curl_setopt($session, CURLOPT_COOKIEFILE, "cookies.txt");
  310. curl_setopt($session, CURLOPT_FOLLOWLOCATION, true);
  311. curl_setopt($session, CURLOPT_MAXREDIRS, 10);
  312. curl_setopt($session, CURLOPT_URL, $link);
  313. curl_setopt($session, CURLOPT_HEADER, false);
  314. curl_setopt($session, CURLOPT_RETURNTRANSFER, true);
  315. curl_setopt($session, CURLOPT_CONNECTTIMEOUT, 15);
  316. curl_setopt($session, CURLOPT_HTTPHEADER, $custom_headers);
  317.  
  318. $result = curl_exec($session);
  319. curl_close($session);
  320.  
  321. return $result;
  322.  
  323. }
  324. ?>

© 2003 – 2010 PHP.lv komanda. Visas tiesības ir paturētas. Izņemot saturu, kurš ir tā autora īpašums, ja nekas nenosaka savādāk.