2017-07-01 23:09:35 +02:00
package sanitize
2017-05-09 09:28:29 +02:00
import (
2017-05-24 00:08:02 +02:00
"bytes"
2017-08-29 01:56:44 +02:00
"encoding/xml"
2017-05-09 09:28:29 +02:00
"html/template"
2017-08-29 01:56:44 +02:00
"io"
2017-05-24 00:08:02 +02:00
"regexp"
2017-05-10 23:30:46 +02:00
"strings"
2017-05-24 00:08:02 +02:00
2017-08-29 01:56:44 +02:00
"github.com/NyaaPantsu/nyaa/utils/log"
"github.com/frustra/bbcode"
2017-05-24 00:08:02 +02:00
"github.com/microcosm-cc/bluemonday"
md "github.com/russross/blackfriday"
"golang.org/x/net/html"
2017-05-09 09:28:29 +02:00
)
2017-05-24 00:08:02 +02:00
2017-05-10 23:30:46 +02:00
// Markdown rendering configuration: some default rules, plus and minus some.
var (
	// mdOptions is the set of blackfriday parser extensions handed to
	// md.MarkdownOptions by MarkdownToHTML.
	mdOptions = md.EXTENSION_AUTOLINK |
		md.EXTENSION_HARD_LINE_BREAK |
		md.EXTENSION_NO_EMPTY_LINE_BEFORE_BLOCK |
		md.EXTENSION_NO_INTRA_EMPHASIS |
		md.EXTENSION_SPACE_HEADERS |
		md.EXTENSION_STRIKETHROUGH

	// htmlFlags is the set of blackfriday HTML renderer flags used to build
	// HTMLMdRenderer in init.
	htmlFlags = md.HTML_USE_XHTML |
		md.HTML_SMARTYPANTS_FRACTIONS |
		md.HTML_SAFELINK |
		md.HTML_NOREFERRER_LINKS |
		md.HTML_HREF_TARGET_BLANK
)
2017-08-29 01:56:44 +02:00
// XML shapes used by repairHTMLTags to pull the contents of <body> back out
// of a fully rendered HTML document.
type (
	// htmlTag maps the root <html> element of the rendered document.
	htmlTag struct {
		XMLName xml.Name ` xml:"html" `
		Body    body     ` xml:"body" `
	}

	// body captures everything inside <body> as raw, undecoded markup.
	body struct {
		Content string ` xml:",innerxml" `
	}
)
2017-05-10 23:30:46 +02:00
func init ( ) {
2017-07-23 07:46:29 +02:00
HTMLMdRenderer = md . HtmlRenderer ( htmlFlags , "" , "" )
2017-08-29 01:56:44 +02:00
BBCodesRenderer = bbcode . NewCompiler ( true , true ) // autoCloseTags, ignoreUnmatchedClosingTags
BBCodesRenderer . SetTag ( "url" , func ( node * bbcode . BBCodeNode ) ( * bbcode . HTMLTag , bool ) {
out , appendExpr := bbcode . DefaultTagCompilers [ "url" ] ( node )
out . Attrs [ "rel" ] = "nofollow"
return out , appendExpr
} )
2017-05-10 23:30:46 +02:00
}
2017-05-24 09:11:13 +02:00
2017-07-23 07:46:29 +02:00
// HTMLMdRenderer is the blackfriday renderer used to turn markdown into HTML.
// It is assigned in init from md.HtmlRenderer(htmlFlags, "", "") and passed to
// md.MarkdownOptions by MarkdownToHTML.
var HTMLMdRenderer md . Renderer
2017-05-09 09:28:29 +02:00
2017-08-29 01:56:44 +02:00
// BBCodesRenderer is the BBCode compiler used to turn BBCode into HTML.
// It is assigned in init, where its "url" tag is overridden to add
// rel="nofollow", and it is called by ParseBBCodes.
var BBCodesRenderer bbcode . Compiler
2017-05-26 12:12:52 +02:00
// MarkdownToHTML : convert markdown to html
Consistency, formatting, error checking, cleanup, and a couple bug fixes (#245)
* Checkpoint: it builds
The config, db, model, network, os, and public packages have had some
fixes to glaringly obvious flaws, dead code removed, and stylistic
changes.
* Style changes and old code removal in router
Router needs a lot of work done to its (lack of) error handling.
* Dead code removal and style changes
Now up to util/email/email.go. After I'm finished with the initial sweep
I'll go back and fix error handling and security issues. Then I'll fix
the broken API. Then I'll go through to add documentation and fix code
visibility.
* Finish dead code removal and style changes
Vendored libraries not touched. Everything still needs security fixes
and documentation. There's also one case of broken functionality.
* Fix accidental find-and-replace
* Style, error checking, saftey, bug fix changes
* Redo error checking erased during merge
* Re-add merge-erased fix. Make Safe safe.
2017-05-10 04:34:40 +02:00
// TODO: restrict certain types of markdown
2017-05-09 09:28:29 +02:00
func MarkdownToHTML ( markdown string ) template . HTML {
2017-06-14 13:27:05 +02:00
if len ( markdown ) >= 4 && markdown [ : 4 ] == ">" {
2017-06-14 12:10:03 +02:00
markdown = ">" + markdown [ 4 : ]
2017-05-10 23:30:46 +02:00
}
2017-05-24 09:11:13 +02:00
markdown = strings . Replace ( markdown , "\n>" , "\n>" , - 1 )
2017-07-23 07:46:29 +02:00
unsafe := md . MarkdownOptions ( [ ] byte ( markdown ) , HTMLMdRenderer , md . Options { Extensions : mdOptions } )
2017-05-09 09:28:29 +02:00
html := bluemonday . UGCPolicy ( ) . SanitizeBytes ( unsafe )
return template . HTML ( html )
}
2017-05-23 22:09:20 +02:00
2017-05-26 12:12:52 +02:00
// Sanitize :
/ * Sanitize a message passed as a string according to a setted model or allowing a set of html tags and output a string
2017-05-23 22:09:20 +02:00
* /
func Sanitize ( msg string , elements ... string ) string {
2017-08-29 01:56:44 +02:00
// Convert BBCodes to HTML
msg = ParseBBCodes ( msg )
// Repair HTML
2017-05-24 00:08:02 +02:00
msg = repairHTMLTags ( msg ) // We repair possible broken html tags
2017-08-29 01:56:44 +02:00
// HTML Sanitize
2017-05-24 00:08:02 +02:00
p := bluemonday . NewPolicy ( )
2017-05-23 22:09:20 +02:00
if len ( elements ) > 0 {
2017-05-24 00:08:02 +02:00
if elements [ 0 ] == "default" { // default model same as UGC without div
///////////////////////
// Global attributes //
///////////////////////
// "class" is not permitted as we are not allowing users to style their own
// content
p . AllowStandardAttributes ( )
//////////////////////////////
// Global URL format policy //
//////////////////////////////
p . AllowStandardURLs ( )
////////////////////////////////
// Declarations and structure //
////////////////////////////////
// "xml" "xslt" "DOCTYPE" "html" "head" are not permitted as we are
// expecting user generated content to be a fragment of HTML and not a full
// document.
//////////////////////////
// Sectioning root tags //
//////////////////////////
// "article" and "aside" are permitted and takes no attributes
p . AllowElements ( "article" , "aside" )
// "body" is not permitted as we are expecting user generated content to be a fragment
// of HTML and not a full document.
// "details" is permitted, including the "open" attribute which can either
// be blank or the value "open".
p . AllowAttrs (
"open" ,
) . Matching ( regexp . MustCompile ( ` (?i)^(|open)$ ` ) ) . OnElements ( "details" )
// "fieldset" is not permitted as we are not allowing forms to be created.
// "figure" is permitted and takes no attributes
p . AllowElements ( "figure" )
// "nav" is not permitted as it is assumed that the site (and not the user)
// has defined navigation elements
// "section" is permitted and takes no attributes
p . AllowElements ( "section" )
// "summary" is permitted and takes no attributes
p . AllowElements ( "summary" )
//////////////////////////
// Headings and footers //
//////////////////////////
// "footer" is not permitted as we expect user content to be a fragment and
// not structural to this extent
// "h1" through "h6" are permitted and take no attributes
p . AllowElements ( "h1" , "h2" , "h3" , "h4" , "h5" , "h6" )
// "header" is not permitted as we expect user content to be a fragment and
// not structural to this extent
// "hgroup" is permitted and takes no attributes
p . AllowElements ( "hgroup" )
/////////////////////////////////////
// Content grouping and separating //
/////////////////////////////////////
// "blockquote" is permitted, including the "cite" attribute which must be
// a standard URL.
p . AllowAttrs ( "cite" ) . OnElements ( "blockquote" )
// "br" "div" "hr" "p" "span" "wbr" are permitted and take no attributes
p . AllowElements ( "br" , "hr" , "p" , "span" , "wbr" )
///////////
// Links //
///////////
// "a" is permitted
p . AllowAttrs ( "href" ) . OnElements ( "a" )
// "area" is permitted along with the attributes that map image maps work
p . AllowAttrs ( "name" ) . Matching (
regexp . MustCompile ( ` ^([\p { L}\p { N}_-]+)$ ` ) ,
) . OnElements ( "map" )
p . AllowAttrs ( "alt" ) . Matching ( bluemonday . Paragraph ) . OnElements ( "area" )
p . AllowAttrs ( "coords" ) . Matching (
regexp . MustCompile ( ` ^([0-9]+,)+[0-9]+$ ` ) ,
) . OnElements ( "area" )
p . AllowAttrs ( "href" ) . OnElements ( "area" )
p . AllowAttrs ( "rel" ) . Matching ( bluemonday . SpaceSeparatedTokens ) . OnElements ( "area" )
p . AllowAttrs ( "shape" ) . Matching (
regexp . MustCompile ( ` (?i)^(default|circle|rect|poly)$ ` ) ,
) . OnElements ( "area" )
p . AllowAttrs ( "usemap" ) . Matching (
regexp . MustCompile ( ` (?i)^#[\p { L}\p { N}_-]+$ ` ) ,
) . OnElements ( "img" )
// "link" is not permitted
/////////////////////
// Phrase elements //
/////////////////////
// The following are all inline phrasing elements
p . AllowElements ( "abbr" , "acronym" , "cite" , "code" , "dfn" , "em" ,
"figcaption" , "mark" , "s" , "samp" , "strong" , "sub" , "sup" , "var" )
// "q" is permitted and "cite" is a URL and handled by URL policies
p . AllowAttrs ( "cite" ) . OnElements ( "q" )
// "time" is permitted
p . AllowAttrs ( "datetime" ) . Matching ( bluemonday . ISO8601 ) . OnElements ( "time" )
////////////////////
// Style elements //
////////////////////
// block and inline elements that impart no semantic meaning but style the
// document
p . AllowElements ( "b" , "i" , "pre" , "small" , "strike" , "tt" , "u" )
// "style" is not permitted as we are not yet sanitising CSS and it is an
// XSS attack vector
//////////////////////
// HTML5 Formatting //
//////////////////////
// "bdi" "bdo" are permitted
p . AllowAttrs ( "dir" ) . Matching ( bluemonday . Direction ) . OnElements ( "bdi" , "bdo" )
// "rp" "rt" "ruby" are permitted
p . AllowElements ( "rp" , "rt" , "ruby" )
///////////////////////////
// HTML5 Change tracking //
///////////////////////////
// "del" "ins" are permitted
p . AllowAttrs ( "cite" ) . Matching ( bluemonday . Paragraph ) . OnElements ( "del" , "ins" )
p . AllowAttrs ( "datetime" ) . Matching ( bluemonday . ISO8601 ) . OnElements ( "del" , "ins" )
///////////
// Lists //
///////////
p . AllowLists ( )
////////////
// Tables //
////////////
p . AllowTables ( )
///////////
// Forms //
///////////
// By and large, forms are not permitted. However there are some form
// elements that can be used to present data, and we do permit those
//
// "button" "fieldset" "input" "keygen" "label" "output" "select" "datalist"
// "textarea" "optgroup" "option" are all not permitted
// "meter" is permitted
p . AllowAttrs (
"value" ,
"min" ,
"max" ,
"low" ,
"high" ,
"optimum" ,
) . Matching ( bluemonday . Number ) . OnElements ( "meter" )
// "progress" is permitted
p . AllowAttrs ( "value" , "max" ) . Matching ( bluemonday . Number ) . OnElements ( "progress" )
//////////////////////
// Embedded content //
//////////////////////
// Vast majority not permitted
// "audio" "canvas" "embed" "iframe" "object" "param" "source" "svg" "track"
// "video" are all not permitted
p . AllowImages ( )
} else if elements [ 0 ] == "comment" {
2017-05-23 22:09:20 +02:00
p . AllowElements ( "b" , "strong" , "em" , "i" , "u" , "blockquote" , "q" )
p . AllowImages ( )
p . AllowStandardURLs ( )
p . AllowAttrs ( "cite" ) . OnElements ( "blockquote" , "q" )
p . AllowAttrs ( "href" ) . OnElements ( "a" )
p . AddTargetBlankToFullyQualifiedLinks ( true )
} else { // allowing set of html tags
p . AllowElements ( elements ... )
}
}
return p . Sanitize ( msg )
2017-05-24 00:08:02 +02:00
}
2017-08-29 01:56:44 +02:00
// repairHTMLTags Should close any opened tags and strip any empty end tags
2017-05-26 12:12:52 +02:00
func repairHTMLTags ( brokenHTML string ) string {
reader := strings . NewReader ( brokenHTML )
2017-05-24 00:08:02 +02:00
root , err := html . Parse ( reader )
2017-08-29 01:56:44 +02:00
if ! log . CheckError ( err ) {
return ""
}
var buf bytes . Buffer
w := io . Writer ( & buf )
html . Render ( w , root )
fixedHTML := htmlTag { }
err = xml . NewDecoder ( bytes . NewBuffer ( buf . Bytes ( ) ) ) . Decode ( & fixedHTML )
if ! log . CheckError ( err ) {
return ""
2017-05-24 00:08:02 +02:00
}
2017-08-29 01:56:44 +02:00
return fixedHTML . Body . Content
}
// ParseBBCodes returns the bbcode compiler with the bbcode tags to parse
func ParseBBCodes ( msg string ) string {
msg = BBCodesRenderer . Compile ( msg )
2017-09-17 21:47:17 +02:00
msg = strings . Replace ( msg , "<br>" , "\n" , - 1 )
2017-08-29 01:56:44 +02:00
// For some reason, BBCodes compiler return escaped html
// We need to unescape it
return html . UnescapeString ( msg )
2017-05-24 09:11:13 +02:00
}