Page Menu
Home
Phorge
Search
Configure Global Search
Log In
Files
F115015
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Award Token
Flag For Later
Size
21 KB
Referenced Files
None
Subscribers
None
View Options
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 9d86357..adfc4a7 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,26 +1,28 @@
# Changelog
## [Unreleased]
### Fixed
- Hashtags followed by HTML tags "a", "code" and "pre" were not detected
+- Incorrect parsing of HTML links inside HTML tags
+- Punctuation marks in the end of urls were included in the html links
## 0.2.0 - 2020-07-21
### Added
- Added a `do_parse/4` clause to skip mentions when we're already skipping something else (eg, when inside a link)
### Fixed
- Fixed a typo in the readme
### Changed
- Refactored `Linkify.Parser.parse/2` to enumerate over the types instead of the opts
- Update dependencies
## 0.1.0 - 2019-07-11
- Initial release
diff --git a/lib/linkify/parser.ex b/lib/linkify/parser.ex
index 38eca8c..dbb27fb 100644
--- a/lib/linkify/parser.ex
+++ b/lib/linkify/parser.ex
@@ -1,368 +1,369 @@
defmodule Linkify.Parser do
@moduledoc """
Module to handle parsing the the input string.
"""
alias Linkify.Builder
@invalid_url ~r/(\.\.+)|(^(\d+\.){1,2}\d+$)/
@match_url ~r{^(?:\W*)?(?<url>(?:https?:\/\/)?[\w.-]+(?:\.[\w\.-]+)+[\w\-\._~%:\/?#[\]@!\$&'\(\)\*\+,;=.]+$)}u
@match_hostname ~r{^\W*(?<scheme>https?:\/\/)?(?:[^@\n]+\\w@)?(?<host>[^:#~\/\n?]+)}u
@match_ip ~r"^(([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\.){3}([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])$"
# @user
# @user@example.com
@match_mention ~r"^@[a-zA-Z\d_-]+@[a-zA-Z0-9_-](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*|@[a-zA-Z\d_-]+"u
# https://www.w3.org/TR/html5/forms.html#valid-e-mail-address
@match_email ~r"^[a-zA-Z0-9.!#$%&'*+\/=?^_`{|}~-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*$"u
@match_hashtag ~r/^(?<tag>\#[[:word:]_]*[[:alpha:]_·][[:word:]_·\p{M}]*)/u
@match_skipped_tag ~r/^(?<tag>(a|code|pre)).*>*/
@prefix_extra [
"magnet:?",
"dweb://",
"dat://",
"gopher://",
"ipfs://",
"ipns://",
"irc://",
"ircs://",
"irc6://",
"mumble://",
"ssb://"
]
@tlds "./priv/tlds.txt" |> File.read!() |> String.split("\n", trim: true) |> MapSet.new()
@default_opts %{
url: true,
validate_tld: true
}
@doc """
Parse the given string, identifying items to link.
Parses the string, replacing the matching urls with an html link.
## Examples
iex> Linkify.Parser.parse("Check out google.com")
~s{Check out <a href="http://google.com">google.com</a>}
"""
@types [:url, :email, :hashtag, :mention, :extra]
def parse(input, opts \\ %{})
def parse(input, opts) when is_binary(input), do: {input, %{}} |> parse(opts) |> elem(0)
def parse(input, list) when is_list(list), do: parse(input, Enum.into(list, %{}))
def parse(input, opts) do
opts = Map.merge(@default_opts, opts)
{buffer, user_acc} = do_parse(input, opts, {"", [], :parsing})
if opts[:iodata] do
{buffer, user_acc}
else
{IO.iodata_to_binary(buffer), user_acc}
end
end
defp accumulate(acc, buffer),
do: [buffer | acc]
defp accumulate(acc, buffer, trailing),
do: [trailing, buffer | acc]
defp do_parse({"", user_acc}, _opts, {"", acc, _}),
do: {Enum.reverse(acc), user_acc}
defp do_parse({"@" <> text, user_acc}, opts, {buffer, acc, :skip}),
do: do_parse({text, user_acc}, opts, {"", accumulate(acc, buffer, "@"), :skip})
defp do_parse(
{"<" <> text, user_acc},
%{hashtag: true} = opts,
{"#" <> _ = buffer, acc, :parsing}
) do
{buffer, user_acc} = link(buffer, opts, user_acc)
case Regex.run(@match_skipped_tag, text, capture: [:tag]) do
[tag] ->
text = String.trim_leading(text, tag)
do_parse({text, user_acc}, opts, {"", accumulate(acc, buffer, "<#{tag}"), :skip})
nil ->
do_parse({text, user_acc}, opts, {"<", acc, {:open, 1}})
end
end
defp do_parse({"<a" <> text, user_acc}, opts, {buffer, acc, :parsing}),
do: do_parse({text, user_acc}, opts, {"", accumulate(acc, buffer, "<a"), :skip})
defp do_parse({"<pre" <> text, user_acc}, opts, {buffer, acc, :parsing}),
do: do_parse({text, user_acc}, opts, {"", accumulate(acc, buffer, "<pre"), :skip})
defp do_parse({"<code" <> text, user_acc}, opts, {buffer, acc, :parsing}),
do: do_parse({text, user_acc}, opts, {"", accumulate(acc, buffer, "<code"), :skip})
defp do_parse({"</a>" <> text, user_acc}, opts, {buffer, acc, :skip}),
do: do_parse({text, user_acc}, opts, {"", accumulate(acc, buffer, "</a>"), :parsing})
defp do_parse({"</pre>" <> text, user_acc}, opts, {buffer, acc, :skip}),
do: do_parse({text, user_acc}, opts, {"", accumulate(acc, buffer, "</pre>"), :parsing})
defp do_parse({"</code>" <> text, user_acc}, opts, {buffer, acc, :skip}),
do: do_parse({text, user_acc}, opts, {"", accumulate(acc, buffer, "</code>"), :parsing})
defp do_parse({"<" <> text, user_acc}, opts, {"", acc, :parsing}),
do: do_parse({text, user_acc}, opts, {"<", acc, {:open, 1}})
defp do_parse({"<" <> text, user_acc}, opts, {"", acc, {:html, level}}) do
do_parse({text, user_acc}, opts, {"<", acc, {:open, level + 1}})
end
defp do_parse({">" <> text, user_acc}, opts, {buffer, acc, {:attrs, _level}}),
do: do_parse({text, user_acc}, opts, {"", accumulate(acc, buffer, ">"), :parsing})
defp do_parse({<<ch::8>> <> text, user_acc}, opts, {"", acc, {:attrs, level}}) do
do_parse({text, user_acc}, opts, {"", accumulate(acc, <<ch::8>>), {:attrs, level}})
end
defp do_parse({"</" <> text, user_acc}, opts, {buffer, acc, {:html, level}}) do
{buffer, user_acc} = link(buffer, opts, user_acc)
do_parse(
{text, user_acc},
opts,
{"", accumulate(acc, buffer, "</"), {:close, level}}
)
end
defp do_parse({">" <> text, user_acc}, opts, {buffer, acc, {:close, 1}}),
do: do_parse({text, user_acc}, opts, {"", accumulate(acc, buffer, ">"), :parsing})
defp do_parse({">" <> text, user_acc}, opts, {buffer, acc, {:close, level}}),
do:
do_parse(
{text, user_acc},
opts,
{"", accumulate(acc, buffer, ">"), {:html, level - 1}}
)
defp do_parse({text, user_acc}, opts, {buffer, acc, {:open, level}}) do
do_parse({text, user_acc}, opts, {"", accumulate(acc, buffer), {:attrs, level}})
end
defp do_parse(
{<<char::bytes-size(1), text::binary>>, user_acc},
opts,
{buffer, acc, state}
)
when char in [" ", "\r", "\n"] do
{buffer, user_acc} = link(buffer, opts, user_acc)
do_parse(
{text, user_acc},
opts,
{"", accumulate(acc, buffer, char), state}
)
end
defp do_parse({<<ch::8>>, user_acc}, opts, {buffer, acc, state}) do
{buffer, user_acc} = link(buffer <> <<ch::8>>, opts, user_acc)
do_parse(
{"", user_acc},
opts,
{"", accumulate(acc, buffer), state}
)
end
defp do_parse({<<ch::8>> <> text, user_acc}, opts, {buffer, acc, state}),
do: do_parse({text, user_acc}, opts, {buffer <> <<ch::8>>, acc, state})
def check_and_link(:url, buffer, opts, _user_acc) do
str =
buffer
|> String.split("<")
|> List.first()
+ |> String.replace(~r/[,.;:)>]$/, "")
|> strip_parens()
if url?(str, opts) do
case @match_url |> Regex.run(str, capture: [:url]) |> hd() do
^buffer ->
link_url(buffer, opts)
url ->
buffer
|> String.split(url)
|> Enum.intersperse(link_url(url, opts))
|> if(opts[:iodata], do: & &1, else: &Enum.join(&1)).()
end
else
:nomatch
end
end
def check_and_link(:email, buffer, opts, _user_acc) do
if email?(buffer, opts), do: link_email(buffer, opts), else: :nomatch
end
def check_and_link(:mention, buffer, opts, user_acc) do
buffer
|> match_mention
|> link_mention(buffer, opts, user_acc)
end
def check_and_link(:hashtag, buffer, opts, user_acc) do
buffer
|> match_hashtag
|> link_hashtag(buffer, opts, user_acc)
end
def check_and_link(:extra, "xmpp:" <> handle, opts, _user_acc) do
if email?(handle, opts), do: link_extra("xmpp:" <> handle, opts), else: handle
end
def check_and_link(:extra, buffer, opts, _user_acc) do
if String.starts_with?(buffer, @prefix_extra), do: link_extra(buffer, opts), else: :nomatch
end
defp strip_parens("(" <> buffer) do
~r/[^\)]*/ |> Regex.run(buffer) |> hd()
end
defp strip_parens(buffer), do: buffer
def url?(buffer, opts) do
valid_url?(buffer) && Regex.match?(@match_url, buffer) && valid_tld?(buffer, opts)
end
def email?(buffer, opts) do
valid_url?(buffer) && Regex.match?(@match_email, buffer) && valid_tld?(buffer, opts)
end
defp valid_url?(url), do: !Regex.match?(@invalid_url, url)
@doc """
Validates a URL's TLD. Returns a boolean.
Will return `true` if `:validate_tld` option set to `false`.
Will skip validation and return `true` if `:validate_tld` set to `:no_scheme` and the url has a scheme.
"""
def valid_tld?(url, opts) do
[scheme, host] = Regex.run(@match_hostname, url, capture: [:scheme, :host])
cond do
opts[:validate_tld] == false ->
true
ip?(host) ->
true
# don't validate if scheme is present
opts[:validate_tld] == :no_scheme and scheme != "" ->
true
true ->
tld = host |> String.split(".") |> List.last()
MapSet.member?(@tlds, tld)
end
end
def ip?(buffer), do: Regex.match?(@match_ip, buffer)
def match_mention(buffer) do
case Regex.run(@match_mention, buffer) do
[mention] -> mention
_ -> nil
end
end
def match_hashtag(buffer) do
case Regex.run(@match_hashtag, buffer, capture: [:tag]) do
[hashtag] -> hashtag
_ -> nil
end
end
def link_hashtag(nil, _buffer, _, _user_acc), do: :nomatch
def link_hashtag(hashtag, buffer, %{hashtag_handler: hashtag_handler} = opts, user_acc) do
hashtag
|> hashtag_handler.(buffer, opts, user_acc)
|> maybe_update_buffer(hashtag, buffer)
end
def link_hashtag(hashtag, buffer, opts, _user_acc) do
hashtag
|> Builder.create_hashtag_link(buffer, opts)
|> maybe_update_buffer(hashtag, buffer)
end
def link_mention(nil, _buffer, _, _user_acc), do: :nomatch
def link_mention(mention, buffer, %{mention_handler: mention_handler} = opts, user_acc) do
mention
|> mention_handler.(buffer, opts, user_acc)
|> maybe_update_buffer(mention, buffer)
end
def link_mention(mention, buffer, opts, _user_acc) do
mention
|> Builder.create_mention_link(buffer, opts)
|> maybe_update_buffer(mention, buffer)
end
defp maybe_update_buffer(out, match, buffer) when is_binary(out) do
maybe_update_buffer({out, nil}, match, buffer)
end
defp maybe_update_buffer({out, user_acc}, match, buffer)
when match != buffer and out != buffer do
out = String.replace(buffer, match, out)
{out, user_acc}
end
defp maybe_update_buffer(out, _match, _buffer), do: out
@doc false
def link_url(buffer, opts) do
Builder.create_link(buffer, opts)
end
@doc false
def link_email(buffer, opts) do
Builder.create_email_link(buffer, opts)
end
def link_extra(buffer, opts) do
Builder.create_extra_link(buffer, opts)
end
defp link(buffer, opts, user_acc) do
Enum.reduce_while(@types, {buffer, user_acc}, fn type, _ ->
if opts[type] == true do
check_and_link_reducer(type, buffer, opts, user_acc)
else
{:cont, {buffer, user_acc}}
end
end)
end
defp check_and_link_reducer(type, buffer, opts, user_acc) do
case check_and_link(type, buffer, opts, user_acc) do
:nomatch -> {:cont, {buffer, user_acc}}
{buffer, user_acc} -> {:halt, {buffer, user_acc}}
buffer -> {:halt, {buffer, user_acc}}
end
end
end
diff --git a/test/parser_test.exs b/test/parser_test.exs
index 718be90..352f237 100644
--- a/test/parser_test.exs
+++ b/test/parser_test.exs
@@ -1,273 +1,304 @@
defmodule Linkify.ParserTest do
use ExUnit.Case, async: true
doctest Linkify.Parser
import Linkify.Parser
describe "url?/2" do
test "valid scheme true" do
valid_scheme_urls()
|> Enum.each(fn url ->
assert url?(url, scheme: true, validate_tld: true)
end)
end
test "invalid scheme true" do
invalid_scheme_urls()
|> Enum.each(fn url ->
refute url?(url, scheme: true, validate_tld: true)
end)
end
test "valid scheme false" do
valid_non_scheme_urls()
|> Enum.each(fn url ->
assert url?(url, scheme: false, validate_tld: true)
end)
end
test "invalid scheme false" do
invalid_non_scheme_urls()
|> Enum.each(fn url ->
refute url?(url, scheme: false, validate_tld: true)
end)
end
test "checks the tld for url with a scheme when validate_tld: true" do
custom_tld_scheme_urls()
|> Enum.each(fn url ->
refute url?(url, scheme: true, validate_tld: true)
end)
end
test "does not check the tld for url with a scheme when validate_tld: false" do
custom_tld_scheme_urls()
|> Enum.each(fn url ->
assert url?(url, scheme: true, validate_tld: false)
end)
end
test "does not check the tld for url with a scheme when validate_tld: :no_scheme" do
custom_tld_scheme_urls()
|> Enum.each(fn url ->
assert url?(url, scheme: true, validate_tld: :no_scheme)
end)
end
test "checks the tld for url without a scheme when validate_tld: true" do
custom_tld_non_scheme_urls()
|> Enum.each(fn url ->
refute url?(url, scheme: false, validate_tld: true)
end)
end
test "checks the tld for url without a scheme when validate_tld: :no_scheme" do
custom_tld_non_scheme_urls()
|> Enum.each(fn url ->
refute url?(url, scheme: false, validate_tld: :no_scheme)
end)
end
test "does not check the tld for url without a scheme when validate_tld: false" do
custom_tld_non_scheme_urls()
|> Enum.each(fn url ->
assert url?(url, scheme: false, validate_tld: false)
end)
end
end
describe "email?" do
test "identifies valid emails" do
valid_emails()
|> Enum.each(fn email ->
assert email?(email, [])
end)
end
test "identifies invalid emails" do
invalid_emails()
|> Enum.each(fn email ->
refute email?(email, [])
end)
end
test "does not validate tlds when validate_tld: false" do
valid_custom_tld_emails()
|> Enum.each(fn email ->
assert email?(email, validate_tld: false)
end)
end
test "validates tlds when validate_tld: true" do
valid_custom_tld_emails()
|> Enum.each(fn email ->
refute email?(email, validate_tld: true)
end)
end
end
describe "parse" do
test "handle line breakes" do
text = "google.com\r\nssss"
expected = "<a href=\"http://google.com\">google.com</a>\r\nssss"
assert parse(text) == expected
end
+ test "handle angle bracket in the end" do
+ text = "google.com <br>"
+ assert parse(text) == "<a href=\"http://google.com\">google.com</a> <br>"
+
+ text = "google.com<br>"
+ assert parse(text) == "<a href=\"http://google.com\">google.com</a><br>"
+
+ text = "google.com<"
+ assert parse(text) == "<a href=\"http://google.com\">google.com</a><"
+
+ text = "google.com>"
+ assert parse(text) == "<a href=\"http://google.com\">google.com</a>>"
+ end
+
test "does not link attributes" do
text = "Check out <a href='google.com'>google</a>"
assert parse(text) == text
text = "Check out <img src='google.com' alt='google.com'/>"
assert parse(text) == text
text = "Check out <span><img src='google.com' alt='google.com'/></span>"
assert parse(text) == text
end
test "does not link inside `<pre>` and `<code>`" do
text = "<pre>google.com</pre>"
assert parse(text) == text
text = "<code>google.com</code>"
assert parse(text) == text
text = "<pre><code>google.com</code></pre>"
assert parse(text) == text
end
test "links url inside html" do
text = "<div>google.com</div>"
expected = "<div><a href=\"http://google.com\">google.com</a></div>"
assert parse(text, class: false, rel: false) == expected
text = "Check out <div class='section'>google.com</div>"
expected =
"Check out <div class='section'><a href=\"http://google.com\">google.com</a></div>"
assert parse(text, class: false, rel: false) == expected
end
test "links url inside nested html" do
text = "<p><strong>google.com</strong></p>"
expected = "<p><strong><a href=\"http://google.com\">google.com</a></strong></p>"
assert parse(text, class: false, rel: false) == expected
end
test "html links inside html" do
text = ~s(<p><a href="http://google.com">google.com</a></p>)
assert parse(text) == text
text = ~s(<span><a href="http://google.com">google.com</a></span>)
assert parse(text) == text
text = ~s(<h1><a href="http://google.com">google.com</a></h1>)
assert parse(text) == text
text = ~s(<li><a href="http://google.com">google.com</a></li>)
assert parse(text) == text
end
test "do not link parens" do
text = " foo (https://example.com/path/folder/), bar"
expected =
" foo (<a href=\"https://example.com/path/folder/\">https://example.com/path/folder/</a>), bar"
assert parse(text, class: false, rel: false, scheme: true) == expected
text = " foo (example.com/path/folder/), bar"
expected =
" foo (<a href=\"http://example.com/path/folder/\">example.com/path/folder/</a>), bar"
assert parse(text, class: false, rel: false) == expected
end
+ test "do not link punctuation marks in the end" do
+ text = "google.com."
+ assert parse(text) == "<a href=\"http://google.com\">google.com</a>."
+
+ text = "google.com;"
+ assert parse(text) == "<a href=\"http://google.com\">google.com</a>;"
+
+ text = "google.com:"
+ assert parse(text) == "<a href=\"http://google.com\">google.com</a>:"
+
+ text = "hack google.com, please"
+ assert parse(text) == "hack <a href=\"http://google.com\">google.com</a>, please"
+
+ text = "(check out google.com)"
+ assert parse(text) == "(check out <a href=\"http://google.com\">google.com</a>)"
+ end
+
test "do not link urls" do
text = "google.com"
assert parse(text, url: false) == text
end
test "do not link `:test.test`" do
text = ":test.test"
assert parse(text, %{
scheme: true,
extra: true,
class: false,
strip_prefix: false,
new_window: false,
rel: false
}) == text
end
end
def valid_number?([list], number) do
assert List.last(list) == number
end
def valid_number?(_, _), do: false
def valid_scheme_urls,
do: [
"https://www.example.com",
"http://www2.example.com",
"http://home.example-site.com",
"http://blog.example.com",
"http://www.example.com/product",
"http://www.example.com/products?id=1&page=2",
"http://www.example.com#up",
"http://255.255.255.255",
"http://www.site.com:8008"
]
def invalid_scheme_urls,
do: [
"http://invalid.com/perl.cgi?key= | http://web-site.com/cgi-bin/perl.cgi?key1=value1&key2"
]
def valid_non_scheme_urls,
do: [
"www.example.com",
"www2.example.com",
"www.example.com:2000",
"www.example.com?abc=1",
"example.example-site.com",
"example.com",
"example.ca",
"example.tv",
"example.com:999?one=one",
"255.255.255.255",
"255.255.255.255:3000?one=1&two=2"
]
def invalid_non_scheme_urls,
do: [
"invalid.com/perl.cgi?key= | web-site.com/cgi-bin/perl.cgi?key1=value1&key2",
"invalid.",
"hi..there",
"555.555.5555"
]
def custom_tld_scheme_urls,
do: [
"http://whatever.null/",
"https://example.o/index.html",
"http://pleroma.i2p/test",
"http://misskey.loki"
]
def custom_tld_non_scheme_urls,
do: [
"whatever.null/",
"example.o/index.html",
"pleroma.i2p/test",
"misskey.loki"
]
def valid_emails, do: ["rms@ai.mit.edu", "vc@cock.li"]
def invalid_emails, do: ["rms[at]ai.mit.edu", "vc@cock", "xmpp:lain@trashserver.net"]
def valid_custom_tld_emails, do: ["guardian@33y6fjyhs3phzfjj.onion", "hi@company.null"]
end
File Metadata
Details
Attached
Mime Type
text/x-diff
Expires
Wed, Nov 27, 12:21 AM (1 d, 12 h)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
40519
Default Alt Text
(21 KB)
Attached To
Mode
R19 linkify
Attached
Detach File
Event Timeline
Log In to Comment