summaryrefslogtreecommitdiffstats
path: root/nb2nikola.pl
blob: 46820faee9e327c4e0c825c097148ca95c6a21c4 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
#!/usr/bin/perl
#
# nb2nikola --- a conversion script from NanoBlogger to nikola
#
# Released under the HOT-BEVERAGE-OF-MY-CHOICE LICENSE: Solomon Peachy wrote
# this script. As long you retain this notice, you can do whatever you want
# with it. If we meet some day, and you feel like it, you can buy me a hot
# beverage of my choice in return.

use strict;
use warnings;

use HTML::WikiConverter::Markdown;
use Date::Manip;

# Directory holding the NanoBlogger data files (*.txt articles and *.db
# category databases). An optional first command-line argument overrides the
# built-in default, so the script can be pointed at another blog without
# editing it.
my $input_directory = defined($ARGV[0])
	? $ARGV[0]
	: "/home/pizza/nanoblogger-3.5-rc1/pizza/data";

# IN deliberately stays a bareword handle: it is released by the closedir(IN)
# at the end of the script.
opendir(IN, $input_directory)
	or die "Unable to open input directory $input_directory: $!";

my @files = readdir(IN);

# Identify database files and store tags for the filenames; the tags are forced
# to become lowercase.
#
# %tags maps an article filename (foo.txt) to a comma-separated list of the
# categories whose database file mentions that article.

my %tags = ();
foreach my $file (@files)
{
	# Every *.db file except master.db is treated as a category database
	next unless $file =~ m/\.db$/i && $file ne "master.db";

	my $db_path = $input_directory . "/" . $file;
	open(my $db, '<', $db_path)
		or die "Unable to open database file $db_path: $!";

	# Category is always the first line of the file
	my $category = <$db>;
	if(!defined($category))
	{
		# Empty database file: there is no category to record
		close($db);
		next;
	}

	chomp($category);
	$category = lc($category);
	$category =~ tr/. /_/; # dots and blanks both become underscores

	if($category eq "")
	{
		warn "Skipping $db_path: empty category name\n";
		close($db);
		next;
	}

	while(my $article = <$db>)
	{
		# Ignore assignments of multiple tags, i.e. foo.txt>1,3. I only
		# require the filename. Lines that name no article are skipped
		# so that a failed match never reuses an undefined or stale $1.
		next unless $article =~ m/(.*\.txt)/;
		my $article_file = $1;

		if(exists($tags{$article_file})) {
			$tags{$article_file} .= ", $category";
		} else {
			$tags{$article_file} = $category;
		}
	}

	close($db);
}

# Process articles: every *.txt file found in the input directory is turned
# into a nikola post named "00-<slug>.md" in the current working directory.

my $wc = HTML::WikiConverter->new(	dialect		=> 'Markdown',
					link_style	=> 'inline',
					image_style	=> 'inline',
					header_style	=> 'atx');

foreach my $file (@files)
{
	next unless $file =~ m/\.txt$/i;

	my $article_path = $input_directory . "/" . $file;
	open(my $article, '<', $article_path)
		or die "Unable to open article file $article_path: $!";

	# This will store the lines that belong to the actual content
	# of the article
	my $raw_article = "";

	my $title = "";
	my $desc = "";
	my $date = "";
	my $format = "raw";

	# Header keywords are only honoured before the BODY: marker and only at
	# the start of a line. Otherwise ordinary body text such as
	# "UPDATE: ..." would be mistaken for a DATE: header.
	my $in_body = 0;

	while(my $line = <$article>)
	{
		chomp($line);
		$line =~ s/\s+$//; # remove trailing whitespaces

		if($in_body)
		{
			# Article delimiters are hardcoded -- END----- closes
			# the body; everything before it is content.
			last if $line =~ m/^END-{5}$/;

			$raw_article .= $line . "\n";
			next;
		}

		if($line =~ /^TITLE:\s*(.*)/) {
			$title = $1;
			next;
		}
		if($line =~ /^AUTHOR:/) {
			# The author is not part of the generated metadata
			next;
		}
		if($line =~ /^DATE:\s*(.*)/) {
			my $raw_date = $1;
			$raw_date =~ tr/@/ /; # an "@" in the date is replaced by a blank before parsing

			my $formatted = UnixDate(ParseDateString($raw_date), "%Y/%m/%d %T");
#			Alternative output formats:
#			"%Y-%m-%d %T"
#			"%B %d, %Y @ %H:%M %Z"
			if(defined($formatted) && $formatted ne "") {
				$date = $formatted;
			} else {
				# Keep the original text rather than losing it
				warn "$article_path: unable to parse date '$raw_date'\n";
				$date = $raw_date;
			}
			next;
		}
		if($line =~ /^DESC:\s*(.*)/) {
			$desc = $1;
			next;
		}
		if($line =~ /^FORMAT:\s*(.*)/) {
			$format = $1;
			next;
		}

		# Article delimiters are hardcoded -- works for me...
		if($line =~ m/^BODY:$/) {
			$in_body = 1;
			next;
		}
		next if $line =~ m/^-{5}$/; # separator between header and body

		# Unrecognised lines ahead of the body are kept as content
		$raw_article .= $line . "\n";
	}

	close($article);

	# The slug doubles as the base of the output filename. Articles
	# without a usable title fall back to their own filename so that they
	# do not all end up in the same output file.
	my $slug = sanitize($title);
	if($slug eq "")
	{
		(my $base = $file) =~ s/\.txt$//i;
		$slug = sanitize($base);
	}

	# Articles that appear in no category database simply get no tags
	my $tag_list = defined($tags{$file}) ? $tags{$file} : "";

	# Full article is created by prepending the metadata header (title,
	# stored tags, ...) to the converted body

	my $out_path = "00-" . $slug . ".md";
	open(my $out, '>', $out_path)
		or die "Unable to open output file $out_path: $!";

	print $out "<!--\n",
		".. link: \n",
		".. description: $desc\n",
		".. tags: $tag_list\n",
		".. date: $date\n",
		".. title: $title\n",
		".. slug: $slug\n",
		".. type: text\n",
		"-->\n\n";

	if($format eq 'markdown' or $raw_article !~ /\S/) {
		# Already Markdown, or nothing to convert
		print $out $raw_article;
	} else {
		print $out $wc->html2wiki($raw_article);
	}

	# Buffered write errors only surface when the handle is closed
	close($out) or die "Unable to write output file $out_path: $!";
}

closedir(IN);

# Turns an arbitrary string into a filename-safe token, following the example
# of Wordpress. The steps, in order:
# 	* Lowercase the input
#	* Drop every character other than 0-9, a-z, underscore, dash and
#	  whitespace (blank, tab, newline, carriage return, form feed)
#	* Turn each remaining whitespace character or dash into an underscore
#	* Squeeze runs of underscores into a single underscore
#	* Strip one trailing underscore (a leading one is kept)
#
# Takes the string to clean as its only argument and returns the cleaned
# string.

sub sanitize
{
	my $name = lc($_[0]);

	# Delete everything outside the allowed character set
	$name =~ tr/0-9a-z_ \t\n\r\f\-//cd;

	# Map separators to underscores, then squeeze repeated underscores
	$name =~ tr/ \t\n\r\f\-/_/;
	$name =~ tr/_//s;

	# After squeezing at most one underscore can remain at the end
	$name =~ s/_\z//;

	return $name;
}