Module: HtmlToPlainText
- Included in:
- Premailer
- Defined in:
- lib/premailer/html_to_plain_text.rb
Overview
Support functions for Premailer
Instance Method Summary
-
#convert_to_text(html, line_length = 65, from_charset = 'UTF-8') ⇒ Object
Returns the text in UTF-8 format with all HTML tags removed.
-
#word_wrap(txt, line_length) ⇒ Object
Taken from Rails' word_wrap helper (http://api.rubyonrails.org/classes/ActionView/Helpers/TextHelper.html#method-i-word_wrap).
Instance Method Details
#convert_to_text(html, line_length = 65, from_charset = 'UTF-8') ⇒ Object
Returns the text in UTF-8 format with all HTML tags removed
TODO: add support for definition lists (DL) and ordered lists (OL).
10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 |
# File 'lib/premailer/html_to_plain_text.rb', line 10

# Returns a plain-text rendering of +html+ with all HTML tags removed.
#
# The conversion is a fixed sequence of regexp rewrites: drop regions marked
# as HTML-only, replace images with their alt text, expand links to
# "text ( url )", decorate headings, turn list items into "* " bullets,
# convert paragraph/line breaks to newlines, strip every remaining tag,
# decode HTML entities, word-wrap, and finally normalize whitespace.
#
# TODO: add support for DL, OL
#
# @param html [String] HTML source; it is copied first, so the caller's
#   string is never modified and may be frozen
# @param line_length [Integer] wrap width handed to #word_wrap, also the
#   upper bound for heading rules
# @param from_charset [String] accepted for interface compatibility; this
#   method does not reference it
# @return [String] the converted text with leading/trailing whitespace removed
def convert_to_text(html, line_length = 65, from_charset = 'UTF-8')
  # Work on a copy: every step below edits the string in place with gsub!,
  # which would otherwise alter the caller's object (and raise on a frozen one).
  txt = html.dup

  # strip text ignored html. Useful for removing
  # headers and footers that aren't needed in the
  # text version
  txt.gsub!(/<!-- start text\/html -->.*?<!-- end text\/html -->/m, '')

  # replace images with their alt attributes
  # for img tags with "" for attribute quotes
  # with or without closing tag
  # eg. the following formats:
  # <img alt="" />
  # <img alt="">
  txt.gsub!(/<img.+?alt=\"([^\"]*)\"[^>]*\>/i, '\1')

  # for img tags with '' for attribute quotes
  # with or without closing tag
  # eg. the following formats:
  # <img alt='' />
  # <img alt=''>
  txt.gsub!(/<img.+?alt=\'([^\']*)\'[^>]*\>/i, '\1')

  # links: "text ( url )", or just the text when it already equals the url
  txt.gsub!(/<a\s[^\n]*?href=["'](mailto:)?([^"']*)["'][^>]*>(.*?)<\/a>/im) do
    href = Regexp.last_match(2).strip
    raw_label = Regexp.last_match(3)
    label = raw_label.strip

    if raw_label.empty?
      ''
    elsif label.downcase == href.downcase
      label
    else
      label + ' ( ' + href + ' )'
    end
  end

  # handle headings (H1-H6)
  # move closing tags to new lines so the single-line (.*) below cannot run
  # past the end of one heading into the next
  txt.gsub!(/(<\/h[1-6]>)/i, "\n\\1")
  txt.gsub!(/[\s]*<h([1-6]+)[^>]*>[\s]*(.*)[\s]*<\/h[1-6]+>/i) do
    hlevel = Regexp.last_match(1).to_i
    htext = Regexp.last_match(2)

    htext.gsub!(/<br[\s]*\/?>/i, "\n") # handle <br>s
    htext.gsub!(/<\/?[^>]*>/i, '')     # strip tags

    # rule width = longest heading line, capped at line_length
    hlength = htext.each_line.map { |l| l.strip.length }.max || 0
    hlength = line_length if hlength > line_length

    htext =
      case hlevel
      when 1 # H1, asterisks above and below
        ('*' * hlength) + "\n" + htext + "\n" + ('*' * hlength)
      when 2 # H2, dashes above and below
        ('-' * hlength) + "\n" + htext + "\n" + ('-' * hlength)
      else   # H3-H6, dashes below
        htext + "\n" + ('-' * hlength)
      end

    "\n\n" + htext + "\n\n"
  end

  # wrap spans
  txt.gsub!(/(<\/span>)[\s]+(<span)/mi, '\1 \2')

  # lists -- TODO: should handle ordered lists
  txt.gsub!(/[\s]*(<li[^>]*>)[\s]*/i, '* ')
  # list not followed by a newline
  txt.gsub!(/<\/li>[\s]*(?![\n])/i, "\n")

  # paragraphs and line breaks
  txt.gsub!(/<\/p>/i, "\n\n")
  txt.gsub!(/<br[\/ ]*>/i, "\n")

  # strip remaining tags
  txt.gsub!(/<\/?[^>]*>/, '')

  # decode HTML entities
  he = HTMLEntities.new
  txt = he.decode(txt)

  # word wrap
  txt = word_wrap(txt, line_length)

  # remove linefeeds (\r\n and \r -> \n)
  txt.gsub!(/\r\n?/, "\n")

  # strip extra spaces
  txt.gsub!(/[ \t]*\302\240+[ \t]*/, " ") # non-breaking spaces -> spaces
  txt.gsub!(/\n[ \t]+/, "\n")             # space at start of lines
  txt.gsub!(/[ \t]+\n/, "\n")             # space at end of lines

  # no more than two consecutive newlines
  txt.gsub!(/[\n]{3,}/, "\n\n")

  # the word wrap messes up the parens: a break may land just inside
  # "( url )". Move such a break outside the parentheses, keeping it on the
  # side where it occurred.
  txt.gsub!(/\(([ \n])(http[^)]+)([\n ])\)/) do
    before, url, after = Regexp.last_match.captures
    (before == "\n" ? before : '') + '( ' + url + ' )' + (after == "\n" ? after : '')
  end
  # a break moved behind ")" leaves the next word's separator space at the
  # start of the following line; re-apply the leading-space cleanup
  txt.gsub!(/\n[ \t]+/, "\n")

  txt.strip
end
#word_wrap(txt, line_length) ⇒ Object
Taken from Rails' word_wrap helper (http://api.rubyonrails.org/classes/ActionView/Helpers/TextHelper.html#method-i-word_wrap)
112 113 114 115 116 |
# File 'lib/premailer/html_to_plain_text.rb', line 112

# Wraps each line of +txt+ that is longer than +line_length+ characters.
#
# Taken from Rails' word_wrap helper
# (http://api.rubyonrails.org/classes/ActionView/Helpers/TextHelper.html#method-i-word_wrap)
#
# Breaks are only inserted at whitespace, so a single word longer than
# +line_length+ is left intact. Lines already within the limit are returned
# untouched; a wrapped line additionally loses its leading and trailing
# whitespace. Trailing empty lines of +txt+ are dropped by the split.
#
# @param txt [String] text whose lines are separated by "\n"
# @param line_length [Integer] maximum number of characters per output line
# @return [String] the re-joined text
def word_wrap(txt, line_length)
  # Build the pattern once per call instead of once per long line.
  chunk = /(.{1,#{line_length}})(\s+|$)/

  wrapped = txt.split("\n").map do |line|
    line.length > line_length ? line.gsub(chunk, "\\1\n").strip : line
  end

  wrapped.join("\n")
end