From bcac8134ea12369d6cf8ba7068ec78fd49a47223 Mon Sep 17 00:00:00 2001 From: Matthew Lemon Date: Tue, 13 Sep 2022 19:59:24 +0100 Subject: better processing of urls --- writing_tools/random_line.pl | 35 +++++++++++++++++++++++++++-------- 1 file changed, 27 insertions(+), 8 deletions(-) (limited to 'writing_tools/random_line.pl') diff --git a/writing_tools/random_line.pl b/writing_tools/random_line.pl index a7f2eb8..fe19ab7 100644 --- a/writing_tools/random_line.pl +++ b/writing_tools/random_line.pl @@ -37,6 +37,13 @@ foreach my $f (glob("$dir/*.md")) { } close $fh or die "can't read close file '$f': $OS_ERROR"; } + +sub striptime { + my $url = shift; + $url =~ s/\?t=\d*//; + return $url; +} + # # Let's interact with the World Wide Web! my $ua = LWP::UserAgent->new; @@ -47,14 +54,13 @@ foreach my $line (@targetlines) { if ($line =~ m/$RE{URI}{HTTP}{-scheme => qr<https?>}{-keep}/) { my$t = $1; $t =~ s/\.$//; # remove the fullstop if it has one at the end - # print "Saving: $t\n"; - # push @urls => $t - my $req = HTTP::Request->new(GET => $t); - $req->header(Accept => "text/html"); - my $res = $ua->request($req); - my $p = HTML::HeadParser->new; - $p->parse($res->content) and print "not finished"; - print $p->header('Title'), "\n"; + push @urls => striptime($t) + # my $req = HTTP::Request->new(GET => $t); + # $req->header(Accept => "text/html"); + # my $res = $ua->request($req); + # my $p = HTML::HeadParser->new; + # $p->parse($res->content) and print "not finished"; + # print $p->header('Title'), "\n"; # my $root = HTML::TreeBuilder->new_from_content($res->content); # my $title = $root->look_down('_tag' => 'title'); # my $value = $title->attr('value'); @@ -62,6 +68,19 @@ foreach my $line (@targetlines) { } +foreach my $url (@urls) { + print "URL: $url\n"; + my $req = HTTP::Request->new(GET => $url); + $req->header(Accept => "text/html"); + my $res = $ua->request($req); + my $p = HTML::HeadParser->new; + $p->parse($res->content) and print "not finished"; + print "TITLE:", 
$p->header('Title'), "\n"; + print "\n"; +} + + + # foreach my $url (@urls) { # print $url; # my $req = HTTP::Request->new(GET => $url); -- cgit v1.2.3