<?xml version="1.0" encoding="utf-8"?>
<rss version="2.0">
  <channel>
    <title>'Extract comments from blog' Thread RSS Feed</title>
    <link>http://www.programmersheaven.com/</link>
    <description>Contains the latest posts from the thread 'Extract comments from blog' posted on the 'Perl' forum at Programmer's Heaven.</description>
    <language>en</language>
    <copyright>Copyright 2012 Programmers Heaven</copyright>
    <pubDate>Wed, 23 May 2012 22:57:51 -0700</pubDate>
    <lastBuildDate>Wed, 23 May 2012 22:57:51 -0700</lastBuildDate>
    <generator>Argotic Syndication Framework 2007.3.0.1, http://www.codeplex.com/Argotic</generator>
    <docs>http://www.rssboard.org/rss-specification</docs>
    <ttl>360</ttl>
    <image>
      <url>http://www.programmersheaven.com/images/ph.gif</url>
      <title>Programmers Heaven</title>
      <link>http://www.programmersheaven.com/</link>
      <width>88</width>
      <height>31</height>
    </image>
    <item>
      <title>Extract comments from blog</title>
      <link>http://www.programmersheaven.com/mb/perl/413744/413744/extract-comments-from-blog/</link>
      <description>Hello,&lt;br /&gt;
I need a Perl script to extract just the comments from a blog. Thanks for any help.&lt;br /&gt;</description>
      <guid isPermaLink="true">http://www.programmersheaven.com/mb/perl/413744/413744/extract-comments-from-blog/</guid>
      <pubDate>Mon, 22 Feb 2010 08:30:48 -0700</pubDate>
      <category>Perl</category>
    </item>
    <item>
      <title>Re: Extract comments from blog</title>
      <link>http://www.programmersheaven.com/mb/perl/413744/422118/re-extract-comments-from-blog/#422118</link>
      <description>Here is an example for googledocs.blogspot.com&lt;br /&gt;
It may also work for other blogs on blogspot.com.&lt;br /&gt;
&lt;br /&gt;
&lt;pre class="sourcecode"&gt;#!/usr/bin/perl

# Fetch a blog front page, follow the post link found at the start of each
# h3 section, and print the text that sits between "comments:" and
# "Post a Comment" on every post page, with markup stripped.
use LWP::UserAgent;
use Term::ANSIColor;
use HTML::Entities;
use HTML::Strip;

$url = 'http://googledocs.blogspot.com/';  # replace here with another URL

$hs = 'HTML::Strip'-&amp;gt;new;
$lwp = 'LWP::UserAgent'-&amp;gt;new;

# Send a browser-like User-Agent string, give up on a request after
# 10 seconds, and pick up proxy settings from the environment.
$lwp-&amp;gt;agent('Mozilla/5.0 (X11; Linux i686) AppleWebKit/534.23 (KHTML, like Gecko) Chrome/11.0.686.1 Safari/534.23');
$lwp-&amp;gt;timeout(10);
$lwp-&amp;gt;env_proxy;
# $c is interpolated into the negated character class [^$c]+ in the comments
# regex below, so that capture runs over anything except this character.
# Presumably it was chosen as a character unlikely to appear in a page.
# NOTE(review): the character reference has no trailing semicolon; confirm
# that decode_entities still decodes it to a single character.
$c = decode_entities('&amp;amp;#8410');
# No check is made that the request succeeded; whatever body came back is used.
$content = $lwp-&amp;gt;get($url)-&amp;gt;content;

# One chunk per h3 heading; the first chunk is whatever precedes the first h3.
@content = split(/&amp;lt;h3&amp;gt;/, $content, 0);  # on some other blogs try /&amp;lt;h1&amp;gt;/

# $url is reused as the loop variable: it starts out holding a raw HTML chunk.
foreach $url (@content) {
    # When the chunk opens with a single-quoted link, keep its target as $url
    # and its text as $title. Otherwise $url stays the raw chunk and $title
    # keeps the value from an earlier iteration.
    if ($url =~ /^[\s]*&amp;lt;a href='([^']+)'&amp;gt;([^&amp;lt;]+)/) {
        $url = $1;
        $title = decode_entities($2);
    }
    # Skip anything that does not begin with http: (this also skips https: links).
    next unless $url =~ /^http:/;
    $content = $lwp-&amp;gt;get($url)-&amp;gt;content;
    # Capture what lies between a line holding only "comments:" and the text
    # "Post a Comment". If nothing matches, $comments stays empty or undefined.
    if ($content =~ /\n[\s]*comments:[\s]*\n([^$c]+)Post a Comment/) {
        $comments = decode_entities($1);
    }
    my $clean_text = $hs-&amp;gt;parse($comments);
    # Squeeze blank lines: while three consecutive newlines remain, replace
    # them with two. The substitution pattern is the last-match variable,
    # which holds the three newlines just matched by the until condition.
    until (not $clean_text =~ /\n\n\n/) {
        $clean_text =~ s/$&amp;amp;/\n\n/g;
    }
    # Post title in bold red, then the stripped comment text in the default colour.
    print color('bold red');
    print "\n\n=&amp;gt;&amp;gt; $title\n";
    print color('reset');
    print $clean_text;
    # Clear $comments so a post without a match does not reprint the previous one.
    # $clean_text is declared with my inside the loop, so it is fresh each pass anyway.
    $clean_text = '';
    $comments = '';
}
&lt;/pre&gt;&lt;br /&gt;</description>
      <guid isPermaLink="true">http://www.programmersheaven.com/mb/perl/413744/422118/re-extract-comments-from-blog/#422118</guid>
      <pubDate>Wed, 02 Mar 2011 07:29:59 -0700</pubDate>
      <category>Perl</category>
    </item>
  </channel>
</rss>
