diff options
author | Matthew Lemon <matt@matthewlemon.com> | 2022-09-18 21:12:26 +0100 |
---|---|---|
committer | Matthew Lemon <matt@matthewlemon.com> | 2022-09-18 21:12:26 +0100 |
commit | c67ec9f37c46f42297a6844806debe439be290cd (patch) | |
tree | 23089bf978eccb1da6cb4918bace71ad3f884230 /journal/create_md_links_from_journal_urls.pl | |
parent | 49d443767d1290db4374ab0059382b047d902567 (diff) |
tidy up files
Diffstat (limited to 'journal/create_md_links_from_journal_urls.pl')
-rw-r--r-- | journal/create_md_links_from_journal_urls.pl | 100 |
1 files changed, 100 insertions, 0 deletions
diff --git a/journal/create_md_links_from_journal_urls.pl b/journal/create_md_links_from_journal_urls.pl new file mode 100644 index 0000000..c1801f5 --- /dev/null +++ b/journal/create_md_links_from_journal_urls.pl @@ -0,0 +1,100 @@ +use strict; +use warnings; +no warnings 'utf8'; # prevent the wide character warning you get on STDOUT +use English; +use Regexp::Common qw(URI); +use LWP::UserAgent; +use HTML::TreeBuilder 5 -weak; +use HTML::HeadParser; +use feature qw(say); + +# $| = 1; # turn on autoflush for stdout (https://stackoverflow.com/questions/40608986/print-doesnt-work-while-iterations-are-going-inside-foreach-loop) + +# How to read each file in a directory $dir + +my $numargs = $#ARGV + 1; + +sub usage { + say "Pass a search term or -all. All lines in the journal will be matched and URLs quoted will be extracted, or -all will do all URLs.."; + exit; +} + +if ($numargs != 1) { + usage(); +} + + +my @targetlines; +my $searchterm = $ARGV[0]; + +my @urls; + +my $dir = '/home/lemon/Notes/journal'; + +foreach my $f (glob("$dir/*.md")) { + # printf "%s\n", $f; + open my $fh, "<", $f or die "Cannot open that file '$f': $OS_ERROR"; + while (<$fh>) { + if ($searchterm eq "-all") { + push @targetlines, $_; + } + else { + if ($_ =~ m/$searchterm/) { + # printf " %s", $_; + push @targetlines, $_; + } + } + } + close $fh or die "can't read close file '$f': $OS_ERROR"; +} + +sub striptime { + my $url = shift; + $url =~ s/\?t=\d*//; + return $url; +} + +# +# Let's interact with the World Wide Web! +my $ua = LWP::UserAgent->new; +$ua->agent("Mozilla/8.0"); + +foreach my $line (@targetlines) { + # if ($line =~ /(http.*$)/) { + if ($line =~ m/$RE{URI}{HTTP}{-scheme => qr<https?>}{-keep}/) { + my$t = $1; + $t =~ s/\.$//; # remove the fullstop if it has one at the end + push @urls => striptime($t) + } +} + +# get rid of duplicates from array or urls +# see perlfaq4 +my %riddups = map { $_, 1 } @urls; +my @uniqueurls = keys %riddups; + +sub create_mdlink { + my ($url, $title) = @_; + if ($title eq "") { + $title = "- UKNOWN TITLE -"; + } + + return "[".$title."]"."(".$url.")" + +} + +my @mdurls; + +foreach my $url (@uniqueurls) { + my $req = HTTP::Request->new(GET => $url); + $req->header(Accept => "text/html"); + my $res = $ua->request($req); + my $p = HTML::HeadParser->new; + $p->parse($res->content) and print "not finished"; + my $title = $p->header('Title'); + push @mdurls => create_mdlink($url, $title); +} + +$, = "\n\n- "; +print "\n\n"; +print @mdurls; |