Page MenuHomePhorge

No OneTemporary

Size
21 KB
Referenced Files
None
Subscribers
None
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 9d86357..adfc4a7 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,26 +1,28 @@
# Changelog
## [Unreleased]
### Fixed
- Hashtags followed by HTML tags "a", "code" and "pre" were not detected
+- Incorrect parsing of HTML links inside HTML tags
+- Punctuation marks at the end of urls were included in the html links
## 0.2.0 - 2020-07-21
### Added
- Added a `do_parse/4` clause to skip mentions when we're already skipping something else (eg, when inside a link)
### Fixed
- Fixed a typo in the readme
### Changed
- Refactored `Linkify.Parser.parse/2` to enumerate over the types instead of the opts
- Update dependencies
## 0.1.0 - 2019-07-11
- Initial release
diff --git a/lib/linkify/parser.ex b/lib/linkify/parser.ex
index 38eca8c..dbb27fb 100644
--- a/lib/linkify/parser.ex
+++ b/lib/linkify/parser.ex
@@ -1,368 +1,369 @@
defmodule Linkify.Parser do
@moduledoc """
Module to handle parsing the input string.
"""
alias Linkify.Builder
# Rejects candidates that contain two or more consecutive dots, or that consist solely of
# two or three dot-separated digit groups (e.g. "1.2" or "555.555.5555").
@invalid_url ~r/(\.\.+)|(^(\d+\.){1,2}\d+$)/
# Captures `url`: optional http(s) scheme, a dotted host, then URL characters up to the end
# of the string. Leading non-word characters are skipped and not part of the capture.
@match_url ~r{^(?:\W*)?(?<url>(?:https?:\/\/)?[\w.-]+(?:\.[\w\.-]+)+[\w\-\._~%:\/?#[\]@!\$&'\(\)\*\+,;=.]+$)}u
# Captures `scheme` (may be empty) and `host` (everything up to the first of `: # ~ / ?` or newline).
# NOTE(review): inside a sigil `\\w@` matches a literal backslash followed by "w@", so the
# optional user-info group looks like it can never match real input; presumably `\w@` was
# intended -- confirm before changing.
@match_hostname ~r{^\W*(?<scheme>https?:\/\/)?(?:[^@\n]+\\w@)?(?<host>[^:#~\/\n?]+)}u
# Dotted-quad IPv4 address with every octet in 0..255.
@match_ip ~r"^(([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\.){3}([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])$"
# @user
# @user@example.com
# The `^` anchor belongs to the first alternative only; the bare `@user` alternative is unanchored.
@match_mention ~r"^@[a-zA-Z\d_-]+@[a-zA-Z0-9_-](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*|@[a-zA-Z\d_-]+"u
# https://www.w3.org/TR/html5/forms.html#valid-e-mail-address
@match_email ~r"^[a-zA-Z0-9.!#$%&'*+\/=?^_`{|}~-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*$"u
# Captures `tag`: a leading "#" followed by word characters that include at least one
# letter, underscore or middle dot.
@match_hashtag ~r/^(?<tag>\#[[:word:]_]*[[:alpha:]_·][[:word:]_·\p{M}]*)/u
# Captures `tag` when the text starts with "a", "code" or "pre". There is no word boundary
# after the name, so any text beginning with those letters matches.
@match_skipped_tag ~r/^(?<tag>(a|code|pre)).*>*/
# Prefixes that check_and_link(:extra, ...) links as-is.
@prefix_extra [
"magnet:?",
"dweb://",
"dat://",
"gopher://",
"ipfs://",
"ipns://",
"irc://",
"ircs://",
"irc6://",
"mumble://",
"ssb://"
]
# Known TLDs, read from ./priv/tlds.txt (one entry per line) when the module is compiled.
@tlds "./priv/tlds.txt" |> File.read!() |> String.split("\n", trim: true) |> MapSet.new()
# Options merged underneath the caller's options in parse/2.
@default_opts %{
url: true,
validate_tld: true
}
@doc """
Parse the given string, identifying items to link.

Parses the string, replacing the matching urls with an html link.

`input` is either a binary or an `{input, user_acc}` tuple. A binary input returns only
the parsed output; a tuple input returns `{output, user_acc}`. `opts` may be a map or a
keyword list. The output is converted to a binary unless `opts[:iodata]` is truthy.

## Examples

    iex> Linkify.Parser.parse("Check out google.com")
    ~s{Check out <a href="http://google.com">google.com</a>}
"""
@types [:url, :email, :hashtag, :mention, :extra]

def parse(input, opts \\ %{})

# Binary input: wrap it with an empty user accumulator and return only the output.
def parse(input, opts) when is_binary(input) do
  {output, _user_acc} = parse({input, %{}}, opts)
  output
end

# Keyword-list options are normalised to a map first.
def parse(input, list) when is_list(list) do
  opts = Enum.into(list, %{})
  parse(input, opts)
end

def parse(input, opts) do
  merged = Map.merge(@default_opts, opts)
  {buffer, user_acc} = do_parse(input, merged, {"", [], :parsing})
  output = if merged[:iodata], do: buffer, else: IO.iodata_to_binary(buffer)
  {output, user_acc}
end
# Pushes `buffer` onto the output accumulator, which is kept in reverse order.
defp accumulate(acc, buffer), do: [buffer | acc]

# Pushes `buffer` and then `trailing`, so that `trailing` follows `buffer` once the
# accumulator is reversed.
defp accumulate(acc, buffer, trailing), do: [trailing | [buffer | acc]]
# do_parse/3 walks the input, carrying `{buffer, acc, state}`:
# `buffer` is the pending word, `acc` the reversed output, `state` the parser mode.
# Terminal clause: input and buffer are both empty, so return the output in order.
defp do_parse({"", user_acc}, _opts, {"", acc, _}),
do: {Enum.reverse(acc), user_acc}
# In :skip state an "@" flushes the buffer and is emitted verbatim, so the text after it
# never starts with "@" when it is later handed to link/3.
defp do_parse({"@" <> text, user_acc}, opts, {buffer, acc, :skip}),
do: do_parse({text, user_acc}, opts, {"", accumulate(acc, buffer, "@"), :skip})
# A "<" arrives while a hashtag candidate ("#...") is buffered in :parsing state and
# hashtag linking is enabled. The buffered text is linked first, then the tag is inspected:
#   * tag name starting with "a", "code" or "pre": emit the linked buffer and the tag
#     opener, then switch to :skip;
#   * any other tag: emit the linked buffer and start a regular tag at nesting level 1.
defp do_parse(
       {"<" <> text, user_acc},
       %{hashtag: true} = opts,
       {"#" <> _ = buffer, acc, :parsing}
     ) do
  {buffer, user_acc} = link(buffer, opts, user_acc)

  case Regex.run(@match_skipped_tag, text, capture: [:tag]) do
    [tag] ->
      # Drop the tag name exactly once. `String.trim_leading/2` would strip every repeated
      # occurrence and corrupt names such as "<aa-widget>".
      text = String.replace_prefix(text, tag, "")
      do_parse({text, user_acc}, opts, {"", accumulate(acc, buffer, "<#{tag}"), :skip})

    nil ->
      # The linked buffer must be emitted here too; otherwise the hashtag in front of
      # tags like "<br>" disappears from the output.
      do_parse({text, user_acc}, opts, {"<", accumulate(acc, buffer), {:open, 1}})
  end
end
# Opening "<a", "<pre" or "<code" while :parsing: flush the buffer unlinked together with
# the tag opener and switch to :skip. These are prefix matches, so any tag whose name
# merely starts with those letters takes the same path.
defp do_parse({"<a" <> text, user_acc}, opts, {buffer, acc, :parsing}),
do: do_parse({text, user_acc}, opts, {"", accumulate(acc, buffer, "<a"), :skip})
defp do_parse({"<pre" <> text, user_acc}, opts, {buffer, acc, :parsing}),
do: do_parse({text, user_acc}, opts, {"", accumulate(acc, buffer, "<pre"), :skip})
defp do_parse({"<code" <> text, user_acc}, opts, {buffer, acc, :parsing}),
do: do_parse({text, user_acc}, opts, {"", accumulate(acc, buffer, "<code"), :skip})
# Matching closing tags while in :skip: flush the buffer unlinked and resume :parsing.
defp do_parse({"</a>" <> text, user_acc}, opts, {buffer, acc, :skip}),
do: do_parse({text, user_acc}, opts, {"", accumulate(acc, buffer, "</a>"), :parsing})
defp do_parse({"</pre>" <> text, user_acc}, opts, {buffer, acc, :skip}),
do: do_parse({text, user_acc}, opts, {"", accumulate(acc, buffer, "</pre>"), :parsing})
defp do_parse({"</code>" <> text, user_acc}, opts, {buffer, acc, :skip}),
do: do_parse({text, user_acc}, opts, {"", accumulate(acc, buffer, "</code>"), :parsing})
# Any other "<" with an empty buffer while :parsing starts a tag at nesting level 1.
defp do_parse({"<" <> text, user_acc}, opts, {"", acc, :parsing}),
do: do_parse({text, user_acc}, opts, {"<", acc, {:open, 1}})
# "<" with an empty buffer inside {:html, level} opens a tag one level deeper.
defp do_parse({"<" <> text, user_acc}, opts, {"", acc, {:html, level}}) do
do_parse({text, user_acc}, opts, {"<", acc, {:open, level + 1}})
end
# ">" ends the attribute section of a tag and returns to :parsing, whatever the level.
# NOTE(review): among the clauses visible here, {:html, _} is entered only from {:close, _}
# and {:close, _} only from {:html, _}; confirm whether those states are still reachable.
defp do_parse({">" <> text, user_acc}, opts, {buffer, acc, {:attrs, _level}}),
do: do_parse({text, user_acc}, opts, {"", accumulate(acc, buffer, ">"), :parsing})
# Inside a tag's attributes every byte is copied straight to the output, so attribute
# text is never handed to link/3.
defp do_parse({<<ch::8>> <> text, user_acc}, opts, {"", acc, {:attrs, level}}) do
do_parse({text, user_acc}, opts, {"", accumulate(acc, <<ch::8>>), {:attrs, level}})
end
# "</" inside {:html, level}: link the pending buffer, emit it and start a closing tag.
defp do_parse({"</" <> text, user_acc}, opts, {buffer, acc, {:html, level}}) do
{buffer, user_acc} = link(buffer, opts, user_acc)
do_parse(
{text, user_acc},
opts,
{"", accumulate(acc, buffer, "</"), {:close, level}}
)
end
# ">" finishing a closing tag: level 1 returns to :parsing, deeper levels step back
# to {:html, level - 1}.
defp do_parse({">" <> text, user_acc}, opts, {buffer, acc, {:close, 1}}),
do: do_parse({text, user_acc}, opts, {"", accumulate(acc, buffer, ">"), :parsing})
defp do_parse({">" <> text, user_acc}, opts, {buffer, acc, {:close, level}}),
do:
do_parse(
{text, user_acc},
opts,
{"", accumulate(acc, buffer, ">"), {:html, level - 1}}
)
# {:open, level} is transient: flush the buffer (the "<") and continue in {:attrs, level}
# without consuming any input.
defp do_parse({text, user_acc}, opts, {buffer, acc, {:open, level}}) do
do_parse({text, user_acc}, opts, {"", accumulate(acc, buffer), {:attrs, level}})
end
# A space, CR or LF ends the current word: try to link the buffer, emit it followed by the
# whitespace character and keep the state.
# NOTE(review): `state` is unconstrained, so this also runs link/3 in :skip state --
# confirm that linking words inside skipped tags is intended.
defp do_parse(
{<<char::bytes-size(1), text::binary>>, user_acc},
opts,
{buffer, acc, state}
)
when char in [" ", "\r", "\n"] do
{buffer, user_acc} = link(buffer, opts, user_acc)
do_parse(
{text, user_acc},
opts,
{"", accumulate(acc, buffer, char), state}
)
end
# Last byte of the input: append it to the buffer, try to link the result, emit it and
# recurse with empty input so the terminal clause finishes the parse.
defp do_parse({<<ch::8>>, user_acc}, opts, {buffer, acc, state}) do
{buffer, user_acc} = link(buffer <> <<ch::8>>, opts, user_acc)
do_parse(
{"", user_acc},
opts,
{"", accumulate(acc, buffer), state}
)
end
# Default: move one byte from the input to the buffer.
defp do_parse({<<ch::8>> <> text, user_acc}, opts, {buffer, acc, state}),
do: do_parse({text, user_acc}, opts, {buffer <> <<ch::8>>, acc, state})
# Tries to link `buffer` as a URL.
# Returns the link for the whole buffer, the buffer with its URL part replaced by a link
# (a binary, or a list when `opts[:iodata]` is truthy), or `:nomatch`.
def check_and_link(:url, buffer, opts, _user_acc) do
# Candidate = text before the first "<", minus one trailing character out of `,.;:)>`,
# and, when it starts with "(", only the part between that "(" and the next ")".
str =
buffer
|> String.split("<")
|> List.first()
+ |> String.replace(~r/[,.;:)>]$/, "")
|> strip_parens()
if url?(str, opts) do
# url?/2 already matched @match_url against `str`, so Regex.run/3 returns a list here.
case @match_url |> Regex.run(str, capture: [:url]) |> hd() do
# The URL is the entire buffer: link it as a whole.
^buffer ->
link_url(buffer, opts)
# Otherwise keep the surrounding text and put the link where the URL was.
url ->
buffer
|> String.split(url)
|> Enum.intersperse(link_url(url, opts))
|> if(opts[:iodata], do: & &1, else: &Enum.join(&1)).()
end
else
:nomatch
end
end
# Links the buffer as an e-mail address when email?/2 accepts it.
def check_and_link(:email, buffer, opts, _user_acc) do
  if email?(buffer, opts) do
    link_email(buffer, opts)
  else
    :nomatch
  end
end

# Looks for a mention in the buffer and hands the (possibly nil) match to link_mention/4.
def check_and_link(:mention, buffer, opts, user_acc) do
  mention = match_mention(buffer)
  link_mention(mention, buffer, opts, user_acc)
end

# Looks for a hashtag in the buffer and hands the (possibly nil) match to link_hashtag/4.
def check_and_link(:hashtag, buffer, opts, user_acc) do
  hashtag = match_hashtag(buffer)
  link_hashtag(hashtag, buffer, opts, user_acc)
end
# Links "xmpp:" URIs whose handle is a valid e-mail style address.
# Returns `:nomatch` for an invalid handle so the caller keeps the buffer untouched.
# Returning the bare handle instead would drop the "xmpp:" prefix from the output.
def check_and_link(:extra, "xmpp:" <> handle = buffer, opts, _user_acc) do
  if email?(handle, opts), do: link_extra(buffer, opts), else: :nomatch
end

# Links buffers that start with one of the @prefix_extra schemes.
def check_and_link(:extra, buffer, opts, _user_acc) do
  if String.starts_with?(buffer, @prefix_extra), do: link_extra(buffer, opts), else: :nomatch
end
# For a buffer starting with "(", returns the text between that "(" and the first ")"
# (or the rest of the buffer when there is no ")"). Other buffers are returned unchanged.
defp strip_parens("(" <> rest) do
  [inside | _] = String.split(rest, ")", parts: 2)
  inside
end

defp strip_parens(buffer), do: buffer
# True when the buffer passes the @invalid_url filter, matches @match_url and has an
# acceptable TLD according to valid_tld?/2.
def url?(buffer, opts) do
  valid_url?(buffer) and Regex.match?(@match_url, buffer) and valid_tld?(buffer, opts)
end

# Same checks as url?/2, but against the e-mail pattern.
def email?(buffer, opts) do
  valid_url?(buffer) and Regex.match?(@match_email, buffer) and valid_tld?(buffer, opts)
end

defp valid_url?(url), do: not Regex.match?(@invalid_url, url)
@doc """
Validates a URL's TLD. Returns a boolean.

Will return `true` if `:validate_tld` option set to `false`.

Will skip validation and return `true` if `:validate_tld` set to `:no_scheme` and the url has a scheme.

IP address hosts are always accepted.
"""
def valid_tld?(url, opts) do
  [scheme, host] = Regex.run(@match_hostname, url, capture: [:scheme, :host])
  mode = opts[:validate_tld]

  validation_disabled? = mode == false
  # don't validate if scheme is present
  scheme_exempt? = mode == :no_scheme and scheme != ""

  validation_disabled? or ip?(host) or scheme_exempt? or
    MapSet.member?(@tlds, host |> String.split(".") |> List.last())
end
# True when the buffer is a dotted-quad IPv4 address.
def ip?(buffer) do
  Regex.match?(@match_ip, buffer)
end

# Returns the mention found in the buffer, or nil.
def match_mention(buffer) do
  with [mention] <- Regex.run(@match_mention, buffer) do
    mention
  else
    _ -> nil
  end
end

# Returns the hashtag found at the start of the buffer, or nil.
def match_hashtag(buffer) do
  with [hashtag] <- Regex.run(@match_hashtag, buffer, capture: [:tag]) do
    hashtag
  else
    _ -> nil
  end
end
# No hashtag was found in the buffer.
def link_hashtag(nil, _buffer, _, _user_acc), do: :nomatch

# A custom `:hashtag_handler` is called with the hashtag, buffer, options and user accumulator.
def link_hashtag(hashtag, buffer, %{hashtag_handler: hashtag_handler} = opts, user_acc) do
  handled = hashtag_handler.(hashtag, buffer, opts, user_acc)
  maybe_update_buffer(handled, hashtag, buffer)
end

# Without a handler the link comes from the Builder.
def link_hashtag(hashtag, buffer, opts, _user_acc) do
  built = Builder.create_hashtag_link(hashtag, buffer, opts)
  maybe_update_buffer(built, hashtag, buffer)
end

# No mention was found in the buffer.
def link_mention(nil, _buffer, _, _user_acc), do: :nomatch

# A custom `:mention_handler` is called with the mention, buffer, options and user accumulator.
def link_mention(mention, buffer, %{mention_handler: mention_handler} = opts, user_acc) do
  handled = mention_handler.(mention, buffer, opts, user_acc)
  maybe_update_buffer(handled, mention, buffer)
end

# Without a handler the link comes from the Builder.
def link_mention(mention, buffer, opts, _user_acc) do
  built = Builder.create_mention_link(mention, buffer, opts)
  maybe_update_buffer(built, mention, buffer)
end
# Normalises a handler/builder result.
# A binary result is paired with a `nil` user accumulator and processed again.
defp maybe_update_buffer(out, match, buffer) when is_binary(out),
  do: maybe_update_buffer({out, nil}, match, buffer)

# When the match covers only part of the buffer (and the result differs from the buffer),
# the result is spliced into the buffer in place of the matched text.
defp maybe_update_buffer({out, user_acc}, match, buffer)
     when match != buffer and out != buffer,
     do: {String.replace(buffer, match, out), user_acc}

# Anything else is passed through unchanged.
defp maybe_update_buffer(out, _match, _buffer), do: out
# Thin wrappers that delegate link construction to Linkify.Builder.
@doc false
def link_url(buffer, opts), do: Builder.create_link(buffer, opts)

@doc false
def link_email(buffer, opts), do: Builder.create_email_link(buffer, opts)

def link_extra(buffer, opts), do: Builder.create_extra_link(buffer, opts)
# Runs the enabled link types in @types order and returns the first successful result as
# `{buffer, user_acc}`. When nothing matches, the inputs are returned unchanged.
defp link(buffer, opts, user_acc) do
  enabled_types = Enum.filter(@types, fn type -> opts[type] == true end)

  Enum.reduce_while(enabled_types, {buffer, user_acc}, fn type, _unchanged ->
    check_and_link_reducer(type, buffer, opts, user_acc)
  end)
end

# Translates a check_and_link/4 result into a reduce_while instruction:
# `:nomatch` continues with the original values, a tuple result halts as-is, and any other
# result halts paired with the incoming user accumulator.
defp check_and_link_reducer(type, buffer, opts, user_acc) do
  case check_and_link(type, buffer, opts, user_acc) do
    :nomatch -> {:cont, {buffer, user_acc}}
    {_linked, _new_acc} = pair -> {:halt, pair}
    linked -> {:halt, {linked, user_acc}}
  end
end
end
diff --git a/test/parser_test.exs b/test/parser_test.exs
index 718be90..352f237 100644
--- a/test/parser_test.exs
+++ b/test/parser_test.exs
@@ -1,273 +1,304 @@
defmodule Linkify.ParserTest do
use ExUnit.Case, async: true
doctest Linkify.Parser
import Linkify.Parser
# Each test runs url?/2 over one fixture list with a fixed option set.
describe "url?/2" do
  test "valid scheme true" do
    for url <- valid_scheme_urls() do
      assert url?(url, scheme: true, validate_tld: true)
    end
  end

  test "invalid scheme true" do
    for url <- invalid_scheme_urls() do
      refute url?(url, scheme: true, validate_tld: true)
    end
  end

  test "valid scheme false" do
    for url <- valid_non_scheme_urls() do
      assert url?(url, scheme: false, validate_tld: true)
    end
  end

  test "invalid scheme false" do
    for url <- invalid_non_scheme_urls() do
      refute url?(url, scheme: false, validate_tld: true)
    end
  end

  test "checks the tld for url with a scheme when validate_tld: true" do
    for url <- custom_tld_scheme_urls() do
      refute url?(url, scheme: true, validate_tld: true)
    end
  end

  test "does not check the tld for url with a scheme when validate_tld: false" do
    for url <- custom_tld_scheme_urls() do
      assert url?(url, scheme: true, validate_tld: false)
    end
  end

  test "does not check the tld for url with a scheme when validate_tld: :no_scheme" do
    for url <- custom_tld_scheme_urls() do
      assert url?(url, scheme: true, validate_tld: :no_scheme)
    end
  end

  test "checks the tld for url without a scheme when validate_tld: true" do
    for url <- custom_tld_non_scheme_urls() do
      refute url?(url, scheme: false, validate_tld: true)
    end
  end

  test "checks the tld for url without a scheme when validate_tld: :no_scheme" do
    for url <- custom_tld_non_scheme_urls() do
      refute url?(url, scheme: false, validate_tld: :no_scheme)
    end
  end

  test "does not check the tld for url without a scheme when validate_tld: false" do
    for url <- custom_tld_non_scheme_urls() do
      assert url?(url, scheme: false, validate_tld: false)
    end
  end
end
# Each test runs email?/2 over one fixture list with a fixed option set.
describe "email?" do
  test "identifies valid emails" do
    for email <- valid_emails() do
      assert email?(email, [])
    end
  end

  test "identifies invalid emails" do
    for email <- invalid_emails() do
      refute email?(email, [])
    end
  end

  test "does not validate tlds when validate_tld: false" do
    for email <- valid_custom_tld_emails() do
      assert email?(email, validate_tld: false)
    end
  end

  test "validates tlds when validate_tld: true" do
    for email <- valid_custom_tld_emails() do
      refute email?(email, validate_tld: true)
    end
  end
end
# End-to-end checks of parse/2: which text gets wrapped in links and which is left alone.
describe "parse" do
# "\r\n" ends the word like any other whitespace and is kept in the output.
test "handle line breakes" do
text = "google.com\r\nssss"
expected = "<a href=\"http://google.com\">google.com</a>\r\nssss"
assert parse(text) == expected
end
+ test "handle angle bracket in the end" do
+ text = "google.com <br>"
+ assert parse(text) == "<a href=\"http://google.com\">google.com</a> <br>"
+
+ text = "google.com<br>"
+ assert parse(text) == "<a href=\"http://google.com\">google.com</a><br>"
+
+ text = "google.com<"
+ assert parse(text) == "<a href=\"http://google.com\">google.com</a><"
+
+ text = "google.com>"
+ assert parse(text) == "<a href=\"http://google.com\">google.com</a>>"
+ end
+
# URLs that appear as attribute values must come back untouched.
test "does not link attributes" do
text = "Check out <a href='google.com'>google</a>"
assert parse(text) == text
text = "Check out <img src='google.com' alt='google.com'/>"
assert parse(text) == text
text = "Check out <span><img src='google.com' alt='google.com'/></span>"
assert parse(text) == text
end
test "does not link inside `<pre>` and `<code>`" do
text = "<pre>google.com</pre>"
assert parse(text) == text
text = "<code>google.com</code>"
assert parse(text) == text
text = "<pre><code>google.com</code></pre>"
assert parse(text) == text
end
# Text content of ordinary tags is still linked.
test "links url inside html" do
text = "<div>google.com</div>"
expected = "<div><a href=\"http://google.com\">google.com</a></div>"
assert parse(text, class: false, rel: false) == expected
text = "Check out <div class='section'>google.com</div>"
expected =
"Check out <div class='section'><a href=\"http://google.com\">google.com</a></div>"
assert parse(text, class: false, rel: false) == expected
end
test "links url inside nested html" do
text = "<p><strong>google.com</strong></p>"
expected = "<p><strong><a href=\"http://google.com\">google.com</a></strong></p>"
assert parse(text, class: false, rel: false) == expected
end
# Existing anchors nested in other tags must not be linked a second time.
test "html links inside html" do
text = ~s(<p><a href="http://google.com">google.com</a></p>)
assert parse(text) == text
text = ~s(<span><a href="http://google.com">google.com</a></span>)
assert parse(text) == text
text = ~s(<h1><a href="http://google.com">google.com</a></h1>)
assert parse(text) == text
text = ~s(<li><a href="http://google.com">google.com</a></li>)
assert parse(text) == text
end
# Surrounding parentheses and the trailing comma stay outside the generated link.
test "do not link parens" do
text = " foo (https://example.com/path/folder/), bar"
expected =
" foo (<a href=\"https://example.com/path/folder/\">https://example.com/path/folder/</a>), bar"
assert parse(text, class: false, rel: false, scheme: true) == expected
text = " foo (example.com/path/folder/), bar"
expected =
" foo (<a href=\"http://example.com/path/folder/\">example.com/path/folder/</a>), bar"
assert parse(text, class: false, rel: false) == expected
end
+ test "do not link punctuation marks in the end" do
+ text = "google.com."
+ assert parse(text) == "<a href=\"http://google.com\">google.com</a>."
+
+ text = "google.com;"
+ assert parse(text) == "<a href=\"http://google.com\">google.com</a>;"
+
+ text = "google.com:"
+ assert parse(text) == "<a href=\"http://google.com\">google.com</a>:"
+
+ text = "hack google.com, please"
+ assert parse(text) == "hack <a href=\"http://google.com\">google.com</a>, please"
+
+ text = "(check out google.com)"
+ assert parse(text) == "(check out <a href=\"http://google.com\">google.com</a>)"
+ end
+
# With `url: false` nothing is linked.
test "do not link urls" do
text = "google.com"
assert parse(text, url: false) == text
end
test "do not link `:test.test`" do
text = ":test.test"
assert parse(text, %{
scheme: true,
extra: true,
class: false,
strip_prefix: false,
new_window: false,
rel: false
}) == text
end
end
# Asserts that the last element of the single nested list equals `number`.
# Any other first argument yields false.
def valid_number?([list], number), do: assert(List.last(list) == number)
def valid_number?(_, _), do: false
# Fixture lists used by the url?/2 and email?/2 tests above.

def valid_scheme_urls do
  [
    "https://www.example.com",
    "http://www2.example.com",
    "http://home.example-site.com",
    "http://blog.example.com",
    "http://www.example.com/product",
    "http://www.example.com/products?id=1&page=2",
    "http://www.example.com#up",
    "http://255.255.255.255",
    "http://www.site.com:8008"
  ]
end

def invalid_scheme_urls do
  [
    "http://invalid.com/perl.cgi?key= | http://web-site.com/cgi-bin/perl.cgi?key1=value1&key2"
  ]
end

def valid_non_scheme_urls do
  [
    "www.example.com",
    "www2.example.com",
    "www.example.com:2000",
    "www.example.com?abc=1",
    "example.example-site.com",
    "example.com",
    "example.ca",
    "example.tv",
    "example.com:999?one=one",
    "255.255.255.255",
    "255.255.255.255:3000?one=1&two=2"
  ]
end

def invalid_non_scheme_urls do
  [
    "invalid.com/perl.cgi?key= | web-site.com/cgi-bin/perl.cgi?key1=value1&key2",
    "invalid.",
    "hi..there",
    "555.555.5555"
  ]
end

# Hosts whose TLDs the tests expect to be rejected when TLD validation applies.
def custom_tld_scheme_urls do
  [
    "http://whatever.null/",
    "https://example.o/index.html",
    "http://pleroma.i2p/test",
    "http://misskey.loki"
  ]
end

def custom_tld_non_scheme_urls do
  [
    "whatever.null/",
    "example.o/index.html",
    "pleroma.i2p/test",
    "misskey.loki"
  ]
end

def valid_emails do
  ["rms@ai.mit.edu", "vc@cock.li"]
end

def invalid_emails do
  ["rms[at]ai.mit.edu", "vc@cock", "xmpp:lain@trashserver.net"]
end

def valid_custom_tld_emails do
  ["guardian@33y6fjyhs3phzfjj.onion", "hi@company.null"]
end
end

File Metadata

Mime Type
text/x-diff
Expires
Wed, Nov 27, 12:21 AM (1 d, 12 h)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
40519
Default Alt Text
(21 KB)

Event Timeline