It parses html into a clojure tree which it can then walk, stripping the plain text as it goes. You can define how you want to replace specific tags and entities, or black list tags you want to ignore completely.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(ns bhenry.example | |
(:require [clojure.string :as string] | |
[pl.danieljanus.tagsoup :as tags])) | |
;;============ | |
;; HTML Vacuum | |
;;============ | |
(def entities | |
{" " " " | |
" " " " | |
"<" "<" | |
"<" "<" | |
">" ">" | |
">" ">" | |
"&" "&" | |
"&" "&" | |
""" "\"" | |
""" "\"" | |
"'" "'" | |
"'" "'"}) | |
(def ent? (re-pattern "(?i)&[^;\\s]+;")) | |
(defn replace-entities [text] | |
(let [subs (re-seq ent? text)] | |
(loop [subs subs | |
text text] | |
(if (first subs) | |
(let [sub (first subs)] | |
(recur (rest subs) (string/replace text sub (get entities sub "")))) | |
text)))) | |
(def line-breaks? (re-pattern "\n[\\s]+\n")) | |
(defn replace-excessive-line-breaks [text] | |
(string/replace text line-breaks? "\n\n")) | |
(defn string [c] | |
(string/join " " (flatten c))) | |
(defn replace-anchor-tag [{:keys [attr content]}] | |
(format "%s (%s) " (string content) (:href attr))) | |
(defn replace-breaking-tag [{:keys [content]}] | |
(format "\n%s\n" (string content))) | |
(defn emphasize-text [{:keys [content]}] | |
(string/upper-case (string content))) | |
;;======================================= | |
;; The function mapped to a special-tag | |
;; should take a map with the keys: | |
;; :tag :attr :content. where :content is | |
;; a list of contents of any nested tags | |
;;======================================= | |
(def special-tags | |
{:a replace-anchor-tag | |
:hr (constantly "\n==========\n") | |
:br (constantly "\n") | |
:div replace-breaking-tag | |
:p replace-breaking-tag | |
:h1 replace-breaking-tag | |
:h2 replace-breaking-tag | |
:h3 replace-breaking-tag | |
:h4 replace-breaking-tag | |
:ul replace-breaking-tag | |
:ol replace-breaking-tag | |
:b emphasize-text | |
:i emphasize-text | |
:li (fn [{:keys [content]}] (format "- %s\n" (string content)))}) | |
;;===================================== | |
;; Blacklisted tags are ignored completely | |
;;===================================== | |
(def blacklist #{:head | |
:link | |
:style | |
:script}) | |
(def get-content (comp next next)) | |
(def get-tag first) | |
(def get-attr second) | |
(def node (juxt get-tag get-attr get-content)) | |
(defn process-content [tag attr content] | |
(let [action (get special-tags tag :content)] | |
(if (not (some blacklist [tag])) | |
(action | |
{:tag tag | |
:attr attr | |
:content (for [cont content] | |
(if (vector? cont) | |
(apply process-content (node cont)) | |
(string/trim cont)))})))) | |
(defn clj->text [converted-html] | |
(->> converted-html | |
node | |
(apply process-content) | |
flatten | |
string/join)) | |
(defn html->clj [html] | |
(tags/parse-string html)) | |
(defn strip-html [html] | |
(-> html | |
html->clj | |
clj->text | |
replace-entities | |
replace-excessive-line-breaks)) |