No OneTemporary
Actions

Size

21 KB

Referenced Files

None

Subscribers

None

View Options

	diff --git a/CHANGELOG.md b/CHANGELOG.md
	index 9d86357..adfc4a7 100644
	--- a/CHANGELOG.md
	+++ b/CHANGELOG.md
	@@ -1,26 +1,28 @@
	# Changelog

	## [Unreleased]

	### Fixed

	- Hashtags followed by HTML tags "a", "code" and "pre" were not detected
	+- Incorrect parsing of HTML links inside HTML tags
	+- Punctuation marks at the end of URLs were included in the HTML links

	## 0.2.0 - 2020-07-21

	### Added

	- Added a `do_parse/4` clause to skip mentions when we're already skipping something else (eg, when inside a link)

	### Fixed

	- Fixed a typo in the readme

	### Changed

	- Refactored `Linkify.Parser.parse/2` to enumerate over the types instead of the opts
	- Update dependencies

	## 0.1.0 - 2019-07-11

	- Initial release
	diff --git a/lib/linkify/parser.ex b/lib/linkify/parser.ex
	index 38eca8c..dbb27fb 100644
	--- a/lib/linkify/parser.ex
	+++ b/lib/linkify/parser.ex
	@@ -1,368 +1,369 @@
	defmodule Linkify.Parser do
	@moduledoc """
	Module to handle parsing the input string.
	"""

	alias Linkify.Builder

	# NOTE(review): the `\|` sequences in the patterns below look like alternation bars that
	# were escaped by the export this text came from — confirm against the repository copy.

	# Candidates matching this pattern are rejected by valid_url?/1.
	@invalid_url ~r/(\.\.+)\|(^(\d+\.){1,2}\d+$)/

	# The named group `url` is what check_and_link(:url, ...) extracts from a candidate.
	@match_url ~r{^(?:\W)?(?<url>(?:https?:\/\/)?[\w.-]+(?:\.[\w\.-]+)+[\w\-\._~%:\/?#[\]@!\$&'\\+,;=.]+$)}u

	# The named groups `scheme` and `host` are read by valid_tld?/2.
	@match_hostname ~r{^\W*(?<scheme>https?:\/\/)?(?:[^@\n]+\\w@)?(?<host>[^:#~\/\n?]+)}u

	# Tested by ip?/1.
	@match_ip ~r"^(([0-9]\|[1-9][0-9]\|1[0-9]{2}\|2[0-4][0-9]\|25[0-5])\.){3}([0-9]\|[1-9][0-9]\|1[0-9]{2}\|2[0-4][0-9]\|25[0-5])$"

	# Mention forms (used by match_mention/1):
	# @user
	# @user@example.com
	@match_mention ~r"^@[a-zA-Z\d_-]+@[a-zA-Z0-9_-](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*\|@[a-zA-Z\d_-]+"u

	# Used by email?/2.
	# https://www.w3.org/TR/html5/forms.html#valid-e-mail-address
	@match_email ~r"^[a-zA-Z0-9.!#$%&'+\/=?^_`{\|}~-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)$"u

	# The named group `tag` is returned by match_hashtag/1.
	@match_hashtag ~r/^(?<tag>\#[[:word:]_][[:alpha:]_·][[:word:]_·\p{M}])/u

	# The named group `tag` is read by do_parse/3 when a "<" follows a hashtag candidate.
	@match_skipped_tag ~r/^(?<tag>(a\|code\|pre)).>/

	# URI prefixes that check_and_link(:extra, ...) turns into links.
	@prefix_extra [
	  "magnet:?",
	  "dweb://",
	  "dat://",
	  "gopher://",
	  "ipfs://",
	  "ipns://",
	  "irc://",
	  "ircs://",
	  "irc6://",
	  "mumble://",
	  "ssb://"
	]

	# The TLD list is read once, at compile time. Declaring the file as an external
	# resource lets the compiler know the module depends on it, so editing the list
	# triggers a recompile instead of leaving a stale set baked into the module.
	@tlds_path "./priv/tlds.txt"
	@external_resource @tlds_path
	@tlds @tlds_path |> File.read!() |> String.split("\n", trim: true) |> MapSet.new()

	# Defaults that parse/2 merges underneath the caller's options.
	@default_opts %{
	  url: true,
	  validate_tld: true
	}

	@doc """
	Finds linkable items in `input` and replaces them with HTML links.

	`input` is either a binary or an `{input, user_acc}` tuple. `opts` may be a map or a
	keyword list and is merged over the module defaults.

	For a binary `input` only the rendered output is returned. For a tuple the result is
	`{output, user_acc}`, where `output` stays iodata when `opts[:iodata]` is truthy and
	is converted to a binary otherwise.

	## Examples

	    iex> Linkify.Parser.parse("Check out google.com")
	    ~s{Check out <a href="http://google.com">google.com</a>}
	"""

	@types [:url, :email, :hashtag, :mention, :extra]

	def parse(input, opts \\ %{})

	def parse(input, opts) when is_binary(input) do
	  {output, _user_acc} = parse({input, %{}}, opts)
	  output
	end

	def parse(input, opts) when is_list(opts), do: parse(input, Map.new(opts))

	def parse(input, opts) do
	  merged = Map.merge(@default_opts, opts)
	  {buffer, user_acc} = do_parse(input, merged, {"", [], :parsing})
	  output = if merged[:iodata], do: buffer, else: IO.iodata_to_binary(buffer)

	  {output, user_acc}
	end

	# Prepends `buffer` to the output accumulator (the accumulator is reversed once, when
	# parsing finishes).
	defp accumulate(acc, buffer), do: [buffer | acc]

	# Prepends `buffer` and then `trailing`, so `trailing` follows `buffer` in the final,
	# reversed output.
	defp accumulate(acc, buffer, trailing) do
	  acc
	  |> accumulate(buffer)
	  |> accumulate(trailing)
	end

	# Input and buffer are both exhausted: the accumulator was built by prepending, so it
	# is reversed to produce the output in reading order.
	defp do_parse({"", user_acc}, _opts, {"", acc, _}),
	do: {Enum.reverse(acc), user_acc}

	# While in :skip state an "@" is emitted verbatim, right after whatever was buffered,
	# and the state stays :skip.
	defp do_parse({"@" <> text, user_acc}, opts, {buffer, acc, :skip}),
	do: do_parse({text, user_acc}, opts, {"", accumulate(acc, buffer, "@"), :skip})

	# A "<" arrives while hashtags are enabled and the buffer holds a hashtag candidate:
	# try to link the candidate first, then decide how to treat the tag that follows.
	defp do_parse(
	       {"<" <> text, user_acc},
	       %{hashtag: true} = opts,
	       {"#" <> _ = buffer, acc, :parsing}
	     ) do
	  {buffer, user_acc} = link(buffer, opts, user_acc)

	  case Regex.run(@match_skipped_tag, text, capture: [:tag]) do
	    [tag] ->
	      # Remove the tag name exactly once. `String.trim_leading/2` strips every
	      # repeated leading occurrence, so it would also swallow input that happens to
	      # start with the same characters again (for example "aa…" after the tag "a").
	      text = String.replace_prefix(text, tag, "")
	      do_parse({text, user_acc}, opts, {"", accumulate(acc, buffer, "<#{tag}"), :skip})

	    nil ->
	      # Not one of the skipped tags: treat it as an ordinary opening tag.
	      do_parse({text, user_acc}, opts, {"<", acc, {:open, 1}})
	  end
	end

	# "<a", "<pre" or "<code" seen while :parsing: the buffer is emitted without being
	# offered to link/3, the tag prefix is emitted after it, and the state becomes :skip.
	# NOTE(review): these clauses match on the string prefix only, so input such as
	# "<abbr" also takes the "<a" clause — confirm that is intended.
	defp do_parse({"<a" <> text, user_acc}, opts, {buffer, acc, :parsing}),
	do: do_parse({text, user_acc}, opts, {"", accumulate(acc, buffer, "<a"), :skip})

	defp do_parse({"<pre" <> text, user_acc}, opts, {buffer, acc, :parsing}),
	do: do_parse({text, user_acc}, opts, {"", accumulate(acc, buffer, "<pre"), :skip})

	defp do_parse({"<code" <> text, user_acc}, opts, {buffer, acc, :parsing}),
	do: do_parse({text, user_acc}, opts, {"", accumulate(acc, buffer, "<code"), :skip})

	# Any of the three closing tags seen while in :skip emits the buffer and the closing
	# tag and returns to :parsing. The clause does not check which tag opened the skip.
	defp do_parse({"</a>" <> text, user_acc}, opts, {buffer, acc, :skip}),
	do: do_parse({text, user_acc}, opts, {"", accumulate(acc, buffer, "</a>"), :parsing})

	defp do_parse({"</pre>" <> text, user_acc}, opts, {buffer, acc, :skip}),
	do: do_parse({text, user_acc}, opts, {"", accumulate(acc, buffer, "</pre>"), :parsing})

	defp do_parse({"</code>" <> text, user_acc}, opts, {buffer, acc, :skip}),
	do: do_parse({text, user_acc}, opts, {"", accumulate(acc, buffer, "</code>"), :parsing})

	# Any other "<" seen while :parsing with an empty buffer starts a tag at level 1.
	defp do_parse({"<" <> text, user_acc}, opts, {"", acc, :parsing}),
	do: do_parse({text, user_acc}, opts, {"<", acc, {:open, 1}})

	# A "<" seen in {:html, level} with an empty buffer opens a tag one level deeper.
	# NOTE(review): among the clauses visible here, {:html, _} is only produced from
	# {:close, _}, and {:close, _} only from {:html, _}; neither appears reachable from the
	# initial :parsing state — confirm whether these clauses are still live.
	defp do_parse({"<" <> text, user_acc}, opts, {"", acc, {:html, level}}) do
	do_parse({text, user_acc}, opts, {"<", acc, {:open, level + 1}})
	end

	# ">" while reading a tag's attributes ends the tag: emit it and resume :parsing.
	defp do_parse({">" <> text, user_acc}, opts, {buffer, acc, {:attrs, _level}}),
	do: do_parse({text, user_acc}, opts, {"", accumulate(acc, buffer, ">"), :parsing})

	# In {:attrs, level} with an empty buffer each byte is copied straight to the output
	# and is never passed to link/3.
	defp do_parse({<<ch::8>> <> text, user_acc}, opts, {"", acc, {:attrs, level}}) do
	do_parse({text, user_acc}, opts, {"", accumulate(acc, <<ch::8>>), {:attrs, level}})
	end

	# "</" in {:html, level}: try to link the buffered text, emit it followed by "</" and
	# move to {:close, level}.
	defp do_parse({"</" <> text, user_acc}, opts, {buffer, acc, {:html, level}}) do
	{buffer, user_acc} = link(buffer, opts, user_acc)

	do_parse(
	{text, user_acc},
	opts,
	{"", accumulate(acc, buffer, "</"), {:close, level}}
	)
	end

	# ">" finishing a closing tag at level 1 returns to :parsing ...
	defp do_parse({">" <> text, user_acc}, opts, {buffer, acc, {:close, 1}}),
	do: do_parse({text, user_acc}, opts, {"", accumulate(acc, buffer, ">"), :parsing})

	# ... while at any other level it drops back to {:html, level - 1}.
	defp do_parse({">" <> text, user_acc}, opts, {buffer, acc, {:close, level}}),
	do:
	do_parse(
	{text, user_acc},
	opts,
	{"", accumulate(acc, buffer, ">"), {:html, level - 1}}
	)

	# {:open, level} is a one-step transition: the buffer is emitted and parsing continues
	# in {:attrs, level} without consuming any input.
	defp do_parse({text, user_acc}, opts, {buffer, acc, {:open, level}}) do
	do_parse({text, user_acc}, opts, {"", accumulate(acc, buffer), {:attrs, level}})
	end

	# A space, CR or LF ends the current token: the buffer is passed to link/3 and the
	# result is emitted followed by the separator. The state is carried over unchanged.
	# NOTE(review): `state` is unconstrained here, so this clause also runs in :skip —
	# confirm that linking whitespace-delimited tokens inside skipped tags is intended.
	defp do_parse(
	{<<char::bytes-size(1), text::binary>>, user_acc},
	opts,
	{buffer, acc, state}
	)
	when char in [" ", "\r", "\n"] do
	{buffer, user_acc} = link(buffer, opts, user_acc)

	do_parse(
	{text, user_acc},
	opts,
	{"", accumulate(acc, buffer, char), state}
	)
	end

	# Last byte of the input: append it to the buffer, pass the result to link/3, emit it
	# and recurse with empty input and an empty buffer so the terminating clause can finish.
	defp do_parse({<<ch::8>>, user_acc}, opts, {buffer, acc, state}) do
	{buffer, user_acc} = link(buffer <> <<ch::8>>, opts, user_acc)

	do_parse(
	{"", user_acc},
	opts,
	{"", accumulate(acc, buffer), state}
	)
	end

	# Default: move one byte from the input onto the end of the buffer.
	defp do_parse({<<ch::8>> <> text, user_acc}, opts, {buffer, acc, state}),
	do: do_parse({text, user_acc}, opts, {buffer <> <<ch::8>>, acc, state})

	# Tries to link a URL contained in `buffer`. Returns :nomatch when the cleaned-up
	# candidate fails url?/2; otherwise returns the buffer with the URL replaced by a link.
	def check_and_link(:url, buffer, opts, _user_acc) do
	# Candidate = the text before the first "<", minus one trailing character from
	# `,.;:)>`, then passed through strip_parens/1.
	str =
	buffer
	\|> String.split("<")
	\|> List.first()
	+ \|> String.replace(~r/[,.;:)>]$/, "")
	\|> strip_parens()

	if url?(str, opts) do
	case @match_url \|> Regex.run(str, capture: [:url]) \|> hd() do
	# The captured URL is the entire buffer: link it as-is.
	^buffer ->
	link_url(buffer, opts)

	# The URL is only part of the buffer: split the buffer around it and put the link
	# between the pieces. The pieces are joined into a binary unless opts[:iodata] is truthy.
	url ->
	buffer
	\|> String.split(url)
	\|> Enum.intersperse(link_url(url, opts))
	\|> if(opts[:iodata], do: & &1, else: &Enum.join(&1)).()
	end
	else
	:nomatch
	end
	end

	# :email — the whole buffer has to pass email?/2 to be linked.
	def check_and_link(:email, buffer, opts, _user_acc) do
	  if email?(buffer, opts) do
	    link_email(buffer, opts)
	  else
	    :nomatch
	  end
	end

	# :mention — extract a mention from the buffer (nil when there is none) and hand it
	# to link_mention/4.
	def check_and_link(:mention, buffer, opts, user_acc) do
	  mention = match_mention(buffer)
	  link_mention(mention, buffer, opts, user_acc)
	end

	# :hashtag — extract a hashtag from the buffer (nil when there is none) and hand it
	# to link_hashtag/4.
	def check_and_link(:hashtag, buffer, opts, user_acc) do
	  hashtag = match_hashtag(buffer)
	  link_hashtag(hashtag, buffer, opts, user_acc)
	end

	# :extra — an "xmpp:" URI is linked only when the part after the scheme passes
	# email?/2.
	def check_and_link(:extra, "xmpp:" <> handle = buffer, opts, _user_acc) do
	  # Report :nomatch on failure. Returning the bare handle would be taken by
	  # check_and_link_reducer/4 as a successful replacement, silently dropping the
	  # "xmpp:" prefix from the output and stopping the remaining link types from running.
	  if email?(handle, opts), do: link_extra(buffer, opts), else: :nomatch
	end

	# :extra — any other buffer is linked when it starts with one of @prefix_extra.
	def check_and_link(:extra, buffer, opts, _user_acc) do
	  if String.starts_with?(buffer, @prefix_extra), do: link_extra(buffer, opts), else: :nomatch
	end

	# For a buffer that starts with "(", returns the text after it up to (but not
	# including) the first ")". Any other buffer is returned unchanged.
	defp strip_parens("(" <> rest) do
	  [inner | _] = Regex.run(~r/[^\)]*/, rest)
	  inner
	end

	defp strip_parens(other), do: other

	# True when `buffer` is not rejected by valid_url?/1, matches @match_url and passes
	# valid_tld?/2. The checks run in that order and stop at the first failure.
	def url?(buffer, opts) do
	  cond do
	    not valid_url?(buffer) -> false
	    not Regex.match?(@match_url, buffer) -> false
	    true -> valid_tld?(buffer, opts)
	  end
	end

	# Same chain as url?/2, but the shape check uses @match_email.
	def email?(buffer, opts) do
	  cond do
	    not valid_url?(buffer) -> false
	    not Regex.match?(@match_email, buffer) -> false
	    true -> valid_tld?(buffer, opts)
	  end
	end

	# A candidate is acceptable unless it matches @invalid_url.
	defp valid_url?(url) do
	  not Regex.match?(@invalid_url, url)
	end

	@doc """
	Validates a URL's TLD. Returns a boolean.

	Will return `true` if `:validate_tld` option set to `false`.

	Will skip validation and return `true` if the host is accepted by `ip?/1`, or if
	`:validate_tld` is set to `:no_scheme` and the url has a scheme.

	Otherwise the last dot-separated label of the host must be a member of the TLD set.

	When no host can be extracted from `url` there is no TLD to check, so the result is
	`true` only when validation is disabled.
	"""
	def valid_tld?(url, opts) do
	  case Regex.run(@match_hostname, url, capture: [:scheme, :host]) do
	    [scheme, host] ->
	      cond do
	        opts[:validate_tld] == false ->
	          true

	        ip?(host) ->
	          true

	        # don't validate if scheme is present
	        opts[:validate_tld] == :no_scheme and scheme != "" ->
	          true

	        true ->
	          tld = host |> String.split(".") |> List.last()
	          MapSet.member?(@tlds, tld)
	      end

	    # No hostname found. Destructuring the match result directly used to raise a
	    # MatchError here; answer with a boolean instead.
	    nil ->
	      opts[:validate_tld] == false
	  end
	end

	# True when `buffer` matches @match_ip.
	def ip?(buffer) do
	  Regex.match?(@match_ip, buffer)
	end

	# Returns the mention matched by @match_mention, or nil when the run does not yield
	# exactly one match.
	def match_mention(buffer) do
	  with [mention] <- Regex.run(@match_mention, buffer) do
	    mention
	  else
	    _ -> nil
	  end
	end

	# Returns the `tag` capture of @match_hashtag, or nil when there is no match.
	def match_hashtag(buffer) do
	  with [hashtag] <- Regex.run(@match_hashtag, buffer, capture: [:tag]) do
	    hashtag
	  else
	    _ -> nil
	  end
	end

	# No hashtag was extracted from the buffer.
	def link_hashtag(nil, _buffer, _, _user_acc), do: :nomatch

	# The options map carries a :hashtag_handler: call it with the hashtag, the buffer, the
	# options and the user accumulator, then reconcile its result with the buffer.
	def link_hashtag(hashtag, buffer, %{hashtag_handler: handler} = opts, user_acc) do
	  rendered = handler.(hashtag, buffer, opts, user_acc)
	  maybe_update_buffer(rendered, hashtag, buffer)
	end

	# No handler configured: build the link with Builder.create_hashtag_link/3.
	def link_hashtag(hashtag, buffer, opts, _user_acc) do
	  rendered = Builder.create_hashtag_link(hashtag, buffer, opts)
	  maybe_update_buffer(rendered, hashtag, buffer)
	end

	# No mention was extracted from the buffer.
	def link_mention(nil, _buffer, _, _user_acc), do: :nomatch

	# The options map carries a :mention_handler: call it with the mention, the buffer, the
	# options and the user accumulator, then reconcile its result with the buffer.
	def link_mention(mention, buffer, %{mention_handler: handler} = opts, user_acc) do
	  rendered = handler.(mention, buffer, opts, user_acc)
	  maybe_update_buffer(rendered, mention, buffer)
	end

	# No handler configured: build the link with Builder.create_mention_link/3.
	def link_mention(mention, buffer, opts, _user_acc) do
	  rendered = Builder.create_mention_link(mention, buffer, opts)
	  maybe_update_buffer(rendered, mention, buffer)
	end

	# Reconciles a rendered link with the buffer it came from.
	#
	# A bare binary result is first paired with a `nil` accumulator. When the match is only
	# part of the buffer, and the rendered output differs from the buffer, the match is
	# replaced inside the buffer. In every other case the result is returned untouched.
	defp maybe_update_buffer(out, match, buffer) when is_binary(out),
	  do: maybe_update_buffer({out, nil}, match, buffer)

	defp maybe_update_buffer({out, user_acc}, match, buffer)
	     when not (match == buffer or out == buffer),
	     do: {String.replace(buffer, match, out), user_acc}

	defp maybe_update_buffer(result, _match, _buffer), do: result

	# Thin wrappers that delegate to Linkify.Builder (aliased as Builder).

	@doc false
	def link_url(buffer, opts), do: Builder.create_link(buffer, opts)

	@doc false
	def link_email(buffer, opts), do: Builder.create_email_link(buffer, opts)

	def link_extra(buffer, opts), do: Builder.create_extra_link(buffer, opts)

	# Offers `buffer` to each enabled link type, in @types order, and returns
	# `{buffer, user_acc}`. The first type that matches supplies the result; when none
	# matches the inputs come back untouched. A type is enabled only when its option is
	# exactly `true`.
	defp link(buffer, opts, user_acc) do
	  unchanged = {buffer, user_acc}

	  @types
	  |> Enum.filter(fn type -> opts[type] == true end)
	  |> Enum.reduce_while(unchanged, fn type, _previous ->
	    check_and_link_reducer(type, buffer, opts, user_acc)
	  end)
	end

	# Translates a check_and_link/4 result into a reduce_while step: :nomatch continues
	# with the original inputs, a `{buffer, user_acc}` tuple halts with that tuple, and any
	# other value halts as the new buffer paired with the incoming accumulator.
	defp check_and_link_reducer(type, buffer, opts, user_acc) do
	  case check_and_link(type, buffer, opts, user_acc) do
	    :nomatch -> {:cont, {buffer, user_acc}}
	    {_linked, _new_acc} = replaced -> {:halt, replaced}
	    linked -> {:halt, {linked, user_acc}}
	  end
	end
	end
	diff --git a/test/parser_test.exs b/test/parser_test.exs
	index 718be90..352f237 100644
	--- a/test/parser_test.exs
	+++ b/test/parser_test.exs
	@@ -1,273 +1,304 @@
	defmodule Linkify.ParserTest do
	use ExUnit.Case, async: true
	doctest Linkify.Parser

	import Linkify.Parser

	# Each test walks one fixture list and asserts or refutes url?/2 for every entry.
	describe "url?/2" do
	  test "valid scheme true" do
	    for url <- valid_scheme_urls() do
	      assert url?(url, scheme: true, validate_tld: true)
	    end
	  end

	  test "invalid scheme true" do
	    for url <- invalid_scheme_urls() do
	      refute url?(url, scheme: true, validate_tld: true)
	    end
	  end

	  test "valid scheme false" do
	    for url <- valid_non_scheme_urls() do
	      assert url?(url, scheme: false, validate_tld: true)
	    end
	  end

	  test "invalid scheme false" do
	    for url <- invalid_non_scheme_urls() do
	      refute url?(url, scheme: false, validate_tld: true)
	    end
	  end

	  test "checks the tld for url with a scheme when validate_tld: true" do
	    for url <- custom_tld_scheme_urls() do
	      refute url?(url, scheme: true, validate_tld: true)
	    end
	  end

	  test "does not check the tld for url with a scheme when validate_tld: false" do
	    for url <- custom_tld_scheme_urls() do
	      assert url?(url, scheme: true, validate_tld: false)
	    end
	  end

	  test "does not check the tld for url with a scheme when validate_tld: :no_scheme" do
	    for url <- custom_tld_scheme_urls() do
	      assert url?(url, scheme: true, validate_tld: :no_scheme)
	    end
	  end

	  test "checks the tld for url without a scheme when validate_tld: true" do
	    for url <- custom_tld_non_scheme_urls() do
	      refute url?(url, scheme: false, validate_tld: true)
	    end
	  end

	  test "checks the tld for url without a scheme when validate_tld: :no_scheme" do
	    for url <- custom_tld_non_scheme_urls() do
	      refute url?(url, scheme: false, validate_tld: :no_scheme)
	    end
	  end

	  test "does not check the tld for url without a scheme when validate_tld: false" do
	    for url <- custom_tld_non_scheme_urls() do
	      assert url?(url, scheme: false, validate_tld: false)
	    end
	  end
	end

	# Each test walks one fixture list and asserts or refutes email?/2 for every entry.
	describe "email?" do
	  test "identifies valid emails" do
	    for email <- valid_emails() do
	      assert email?(email, [])
	    end
	  end

	  test "identifies invalid emails" do
	    for email <- invalid_emails() do
	      refute email?(email, [])
	    end
	  end

	  test "does not validate tlds when validate_tld: false" do
	    for email <- valid_custom_tld_emails() do
	      assert email?(email, validate_tld: false)
	    end
	  end

	  test "validates tlds when validate_tld: true" do
	    for email <- valid_custom_tld_emails() do
	      refute email?(email, validate_tld: true)
	    end
	  end
	end

	# End-to-end checks of parse/2 on small inputs.
	describe "parse" do
	# A CRLF after a URL is kept as-is and the URL before it is still linked.
	test "handle line breakes" do
	text = "google.com\r\nssss"
	expected = "<a href=\"http://google.com\">google.com</a>\r\nssss"

	assert parse(text) == expected
	end

	# A URL followed by a tag, a lone "<" or a lone ">" is linked without the bracket.
	+ test "handle angle bracket in the end" do
	+ text = "google.com <br>"
	+ assert parse(text) == "<a href=\"http://google.com\">google.com</a> <br>"
	+
	+ text = "google.com<br>"
	+ assert parse(text) == "<a href=\"http://google.com\">google.com</a><br>"
	+
	+ text = "google.com<"
	+ assert parse(text) == "<a href=\"http://google.com\">google.com</a><"
	+
	+ text = "google.com>"
	+ assert parse(text) == "<a href=\"http://google.com\">google.com</a>>"
	+ end
	+
	# URLs that appear inside tag attributes must come back unchanged.
	test "does not link attributes" do
	text = "Check out <a href='google.com'>google</a>"
	assert parse(text) == text
	text = "Check out <img src='google.com' alt='google.com'/>"
	assert parse(text) == text
	text = "Check out <span><img src='google.com' alt='google.com'/></span>"
	assert parse(text) == text
	end

	# Content of <pre> and <code>, including the nested combination, is left alone.
	test "does not link inside `<pre>` and `<code>`" do
	text = "<pre>google.com</pre>"
	assert parse(text) == text

	text = "<code>google.com</code>"
	assert parse(text) == text

	text = "<pre><code>google.com</code></pre>"
	assert parse(text) == text
	end

	# A URL that is the text content of another element is linked in place.
	test "links url inside html" do
	text = "<div>google.com</div>"

	expected = "<div><a href=\"http://google.com\">google.com</a></div>"

	assert parse(text, class: false, rel: false) == expected

	text = "Check out <div class='section'>google.com</div>"

	expected =
	"Check out <div class='section'><a href=\"http://google.com\">google.com</a></div>"

	assert parse(text, class: false, rel: false) == expected
	end

	# Same as above with two levels of wrapping elements.
	test "links url inside nested html" do
	text = "<p><strong>google.com</strong></p>"
	expected = "<p><strong><a href=\"http://google.com\">google.com</a></strong></p>"
	assert parse(text, class: false, rel: false) == expected
	end

	# An existing <a> element wrapped in another element must come back unchanged.
	test "html links inside html" do
	text = ~s(<p><a href="http://google.com">google.com</a></p>)
	assert parse(text) == text

	text = ~s(<span><a href="http://google.com">google.com</a></span>)
	assert parse(text) == text

	text = ~s(<h1><a href="http://google.com">google.com</a></h1>)
	assert parse(text) == text

	text = ~s(<li><a href="http://google.com">google.com</a></li>)
	assert parse(text) == text
	end

	# Parentheses around a URL stay outside the generated link.
	test "do not link parens" do
	text = " foo (https://example.com/path/folder/), bar"

	expected =
	" foo (<a href=\"https://example.com/path/folder/\">https://example.com/path/folder/</a>), bar"

	assert parse(text, class: false, rel: false, scheme: true) == expected

	text = " foo (example.com/path/folder/), bar"

	expected =
	" foo (<a href=\"http://example.com/path/folder/\">example.com/path/folder/</a>), bar"

	assert parse(text, class: false, rel: false) == expected
	end

	# A trailing ".", ";", ":", "," or ")" stays outside the generated link.
	+ test "do not link punctuation marks in the end" do
	+ text = "google.com."
	+ assert parse(text) == "<a href=\"http://google.com\">google.com</a>."
	+
	+ text = "google.com;"
	+ assert parse(text) == "<a href=\"http://google.com\">google.com</a>;"
	+
	+ text = "google.com:"
	+ assert parse(text) == "<a href=\"http://google.com\">google.com</a>:"
	+
	+ text = "hack google.com, please"
	+ assert parse(text) == "hack <a href=\"http://google.com\">google.com</a>, please"
	+
	+ text = "(check out google.com)"
	+ assert parse(text) == "(check out <a href=\"http://google.com\">google.com</a>)"
	+ end
	+
	# With `url: false` a bare domain is returned untouched.
	test "do not link urls" do
	text = "google.com"
	assert parse(text, url: false) == text
	end

	# A token that starts with ":" is returned untouched even with :scheme and :extra on.
	test "do not link `:test.test`" do
	text = ":test.test"

	assert parse(text, %{
	scheme: true,
	extra: true,
	class: false,
	strip_prefix: false,
	new_window: false,
	rel: false
	}) == text
	end
	end

	# For a one-element list whose element is itself a list, asserts that the inner
	# list's last element equals `number`. Any other first argument yields false.
	def valid_number?([digits], number), do: assert(List.last(digits) == number)

	def valid_number?(_other, _number), do: false

	# Fixture lists consumed by the url?/2 and email? tests in this module.

	# URLs with an http(s) scheme that url?/2 is expected to accept.
	def valid_scheme_urls,
	do: [
	"https://www.example.com",
	"http://www2.example.com",
	"http://home.example-site.com",
	"http://blog.example.com",
	"http://www.example.com/product",
	"http://www.example.com/products?id=1&page=2",
	"http://www.example.com#up",
	"http://255.255.255.255",
	"http://www.site.com:8008"
	]

	# Scheme-prefixed input that url?/2 is expected to reject.
	def invalid_scheme_urls,
	do: [
	"http://invalid.com/perl.cgi?key= \| http://web-site.com/cgi-bin/perl.cgi?key1=value1&key2"
	]

	# Scheme-less URLs that url?/2 is expected to accept.
	def valid_non_scheme_urls,
	do: [
	"www.example.com",
	"www2.example.com",
	"www.example.com:2000",
	"www.example.com?abc=1",
	"example.example-site.com",
	"example.com",
	"example.ca",
	"example.tv",
	"example.com:999?one=one",
	"255.255.255.255",
	"255.255.255.255:3000?one=1&two=2"
	]

	# Scheme-less input that url?/2 is expected to reject.
	def invalid_non_scheme_urls,
	do: [
	"invalid.com/perl.cgi?key= \| web-site.com/cgi-bin/perl.cgi?key1=value1&key2",
	"invalid.",
	"hi..there",
	"555.555.5555"
	]

	# Scheme-prefixed URLs that the tests expect to be refused when validate_tld is true
	# and accepted when it is false or :no_scheme.
	def custom_tld_scheme_urls,
	do: [
	"http://whatever.null/",
	"https://example.o/index.html",
	"http://pleroma.i2p/test",
	"http://misskey.loki"
	]

	# Scheme-less counterparts of the list above; the tests expect these to be accepted
	# only when validate_tld is false.
	def custom_tld_non_scheme_urls,
	do: [
	"whatever.null/",
	"example.o/index.html",
	"pleroma.i2p/test",
	"misskey.loki"
	]

	# Addresses email?/2 is expected to accept with default options.
	def valid_emails, do: ["rms@ai.mit.edu", "vc@cock.li"]
	# Inputs email?/2 is expected to reject with default options.
	def invalid_emails, do: ["rms[at]ai.mit.edu", "vc@cock", "xmpp:lain@trashserver.net"]
	# Addresses the tests expect to pass only when validate_tld is false.
	def valid_custom_tld_emails, do: ["guardian@33y6fjyhs3phzfjj.onion", "hi@company.null"]
	end

File Metadata

Mime Type: text/x-diff
Expires: Wed, Nov 27, 12:21 AM (1 d, 12 h)
Storage Engine: blob
Storage Format: Raw Data
Storage Handle: 40519
Default Alt Text: (21 KB)

No OneTemporaryActions

View Options

File Metadata

Event Timeline

No OneTemporary
Actions