#!/usr/bin/perl
###############################################################################
# #
# Name : text_to_url #
# Author : Chris Koeritz #
# Rights : Copyright (C) 2005-$now by Author #
# #
# Purpose: #
# #
# Turns a text file into a web page, where the URLs in the text file #
# appear as links in the web page. #
# #
###############################################################################
# This program is free software; you can redistribute it and/or modify it #
# under the terms of the GNU General Public License as published by the Free #
# Software Foundation; either version 2 of the License or (at your option) #
# any later version. See: "http://www.gruntose.com/Info/GNU/GPL.html" for a #
# version of the License. Please send any updates to "fred@gruntose.com". #
###############################################################################
#require "filename_helper.pl";
#require "inc_num.pl";
# Entry point: hand the command-line arguments (input text file, output
# web page) straight to the generator, then exit successfully.
# Note: plain call syntax -- the legacy &sub(...) form bypasses prototypes
# and is discouraged.
generate_web_page(@ARGV);
exit 0;
# Reads a text file and writes an HTML page in which every URL found in
# the text (http:, https:, ftp:, or bare "www." / "ftp." hosts missing a
# protocol) appears as a clickable link; all other lines are copied
# through as plain text.
#
# Parameters:
#   $text_file - existing input text file to scan (must be non-empty name).
#   $web_page  - output HTML file to create (must not already exist).
#
# Returns nothing useful; prints a diagnostic and returns early on bad
# arguments, dies if either file cannot be opened.
sub generate_web_page {
  my ($text_file, $web_page) = @_;

  if ($text_file eq "") {
    print "The first parameter must be a text file to use as input.\n";
    return;
  }
  if (! -e $text_file) {
    print "The text file that you specified does not exist.\n";
    return;
  }
  if ($web_page eq "") {
    print "The second parameter must be a web page to create.\n";
    return;
  }
  if (-e $web_page) {
    print "The web page you specified is already present--not overwriting.\n";
    return;
  }

  # Three-arg open with lexical handles; the old two-arg/bareword form is
  # a mode-injection hazard and leaks the handles into the global space.
  open(my $in, '<', $text_file)
    or die("Could not open the text file $text_file for reading.\n");
  open(my $out, '>', $web_page)
    or die("Could not open the web page $web_page for writing.\n");

  # dump the web heading stuff out.
  print $out "<html>\n<head>\n<title>Links scavenged from $text_file</title>\n"
    . "</head>\n<body>\n<p>unsorted</p>\n";

  while (my $current = <$in>) {
    chomp $current;          # take CR/LF off of the end.
    $current =~ s/\s+\z//;   # take whitespace off the end of the line.
    $current =~ s/\A\s+//;   # take whitespace off the front of the line.

    # This block repairs partial URLs when no protocol is present: a bare
    # "www." host is assumed to be http, a bare "ftp." host to be ftp.
    # (The original negated-character-class tests like [^h][^t][^t][^p]
    # could not express "not preceded by http://" and misfired.)
    if ($current !~ m{\w+://}) {
      if ($current =~ /\bwww\./) {
        $current =~ s{(www\.)}{http://$1};
      }
      elsif ($current =~ /\bftp\./) {
        $current =~ s{(ftp\.)}{ftp://$1};
      }
    }

    # Look for matches to our supported URL types and emit a link
    # definition; everything else is passed through as ordinary text.
    if ($current =~ m{((?:https?|ftp):[^ ]*)}) {
      my $just_url  = $1;
      # The surrounding text is the line with the URL cut out.
      my $just_text = $current;
      $just_text =~ s{\Q$just_url\E}{ };
      print $out "$just_text\n";
      print $out "<a href=\"$just_url\">$just_url</a><br>\n";
    }
    else {
      # just print a regular line of text.
      print $out "$current<br>\n";
    }
  }

  print $out "</body>\n</html>\n";
  close $in;
  # Buffered write errors only surface at close time on the output handle.
  close $out
    or die("Could not finish writing the web page $web_page.\n");
}
1;