Page Menu
Home
Phorge
Search
Configure Global Search
Log In
Files
F115057
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Award Token
Flag For Later
Size
23 KB
Referenced Files
None
Subscribers
None
View Options
diff --git a/lib/linkify/parser.ex b/lib/linkify/parser.ex
index af5afef..cba70f6 100644
--- a/lib/linkify/parser.ex
+++ b/lib/linkify/parser.ex
@@ -1,473 +1,481 @@
# Copyright © 2017-2018 E-MetroTel
# Copyright © 2019-2022 Pleroma Authors
# SPDX-License-Identifier: MIT
defmodule Linkify.Parser do
@moduledoc """
Module to handle parsing the the input string.
"""
alias Linkify.Builder
@invalid_url ~r/(\.\.+)|(^(\d+\.){1,2}\d+$)/
@match_url ~r{^(?:\W*)?(?<url>(?:https?:\/\/)?[\w.-]+(?:\.[\w\.-]+)+[\w\-\._~%:\/?#[\]@!\$&'\(\)\*\+,;=.]+$)}u
@get_scheme_host ~r{^\W*(?<scheme>https?:\/\/)?(?:[^@\n]+\\w@)?(?<host>[^:#~\/\n?]+)}u
@match_hashtag ~r/^(?<tag>\#[[:word:]_]*[[:alpha:]_·\x{200c}][[:word:]_·\p{M}\x{200c}]*)/u
@match_skipped_tag ~r/^(?<tag>(a|code|pre)).*>*/
@delimiters ~r/[,.;:>?!]*$/
@en_apostrophes [
"'",
"'s",
"'ll",
"'d"
]
@prefix_extra [
"magnet:?",
"dweb://",
"dat://",
"gopher://",
"ipfs://",
"ipns://",
"irc://",
"ircs://",
"irc6://",
"mumble://",
"ssb://"
]
@tlds "./priv/tlds.txt"
|> File.read!()
|> String.split("\n", trim: true)
|> Enum.concat(["onion"])
|> MapSet.new()
@default_opts %{
url: true,
validate_tld: true
}
@doc """
Parse the given string, identifying items to link.
Parses the string, replacing the matching urls with an html link.
## Examples
iex> Linkify.Parser.parse("Check out google.com")
~s{Check out <a href="http://google.com">google.com</a>}
"""
@types [:url, :hashtag, :extra, :mention, :email]
def parse(input, opts \\ %{})
def parse(input, opts) when is_binary(input), do: {input, %{}} |> parse(opts) |> elem(0)
def parse(input, list) when is_list(list), do: parse(input, Enum.into(list, %{}))
def parse(input, opts) do
opts = Map.merge(@default_opts, opts)
{buffer, user_acc} = do_parse(input, opts, {"", [], :parsing})
if opts[:iodata] do
{buffer, user_acc}
else
{IO.iodata_to_binary(buffer), user_acc}
end
end
defp accumulate(acc, buffer),
do: [buffer | acc]
defp accumulate(acc, buffer, trailing),
do: [trailing, buffer | acc]
defp do_parse({"", user_acc}, _opts, {"", acc, _}),
do: {Enum.reverse(acc), user_acc}
defp do_parse(
{"<" <> text, user_acc},
%{hashtag: true} = opts,
{"#" <> _ = buffer, acc, :parsing}
) do
{buffer, user_acc} = link(buffer, opts, user_acc)
buffer =
case buffer do
[_, _, _] -> Enum.join(buffer)
_ -> buffer
end
case Regex.run(@match_skipped_tag, buffer, capture: [:tag]) do
[tag] ->
text = String.trim_leading(text, tag)
do_parse({text, user_acc}, opts, {"", accumulate(acc, buffer, "<#{tag}"), :skip})
nil ->
do_parse({text, user_acc}, opts, {"<", accumulate(acc, buffer, ""), {:open, 1}})
end
end
defp do_parse({"<br" <> text, user_acc}, opts, {buffer, acc, :parsing}) do
{buffer, user_acc} = link(buffer, opts, user_acc)
do_parse({text, user_acc}, opts, {"", accumulate(acc, buffer, "<br"), {:open, 1}})
end
defp do_parse({"<a" <> text, user_acc}, opts, {buffer, acc, :parsing}),
do: do_parse({text, user_acc}, opts, {"", accumulate(acc, buffer, "<a"), :skip})
defp do_parse({"<pre" <> text, user_acc}, opts, {buffer, acc, :parsing}),
do: do_parse({text, user_acc}, opts, {"", accumulate(acc, buffer, "<pre"), :skip})
defp do_parse({"<code" <> text, user_acc}, opts, {buffer, acc, :parsing}),
do: do_parse({text, user_acc}, opts, {"", accumulate(acc, buffer, "<code"), :skip})
defp do_parse({"</a>" <> text, user_acc}, opts, {buffer, acc, :skip}),
do: do_parse({text, user_acc}, opts, {"", accumulate(acc, buffer, "</a>"), :parsing})
defp do_parse({"</pre>" <> text, user_acc}, opts, {buffer, acc, :skip}),
do: do_parse({text, user_acc}, opts, {"", accumulate(acc, buffer, "</pre>"), :parsing})
defp do_parse({"</code>" <> text, user_acc}, opts, {buffer, acc, :skip}),
do: do_parse({text, user_acc}, opts, {"", accumulate(acc, buffer, "</code>"), :parsing})
defp do_parse({"<" <> text, user_acc}, opts, {"", acc, :parsing}),
do: do_parse({text, user_acc}, opts, {"<", acc, {:open, 1}})
defp do_parse({"<" <> text, user_acc}, opts, {buffer, acc, :parsing}) do
{buffer, user_acc} = link(buffer, opts, user_acc)
do_parse({text, user_acc}, opts, {"", accumulate(acc, buffer, "<"), {:open, 1}})
end
defp do_parse({">" <> text, user_acc}, opts, {buffer, acc, {:attrs, _level}}),
do: do_parse({text, user_acc}, opts, {"", accumulate(acc, buffer, ">"), :parsing})
defp do_parse({<<ch::8>> <> text, user_acc}, opts, {"", acc, {:attrs, level}}) do
do_parse({text, user_acc}, opts, {"", accumulate(acc, <<ch::8>>), {:attrs, level}})
end
defp do_parse({text, user_acc}, opts, {buffer, acc, {:open, level}}) do
do_parse({text, user_acc}, opts, {"", accumulate(acc, buffer), {:attrs, level}})
end
defp do_parse(
{<<char::bytes-size(1), text::binary>>, user_acc},
opts,
{buffer, acc, state}
)
when char in [" ", "\r", "\n"] do
{buffer, user_acc} = link(buffer, opts, user_acc)
do_parse(
{text, user_acc},
opts,
{"", accumulate(acc, buffer, char), state}
)
end
defp do_parse({<<ch::8>>, user_acc}, opts, {buffer, acc, state}) do
{buffer, user_acc} = link(buffer <> <<ch::8>>, opts, user_acc)
do_parse(
{"", user_acc},
opts,
{"", accumulate(acc, buffer), state}
)
end
defp do_parse({<<ch::8>> <> text, user_acc}, opts, {buffer, acc, state}),
do: do_parse({text, user_acc}, opts, {buffer <> <<ch::8>>, acc, state})
def check_and_link(:url, buffer, opts, _user_acc) do
if url?(buffer, opts) do
case @match_url |> Regex.run(buffer, capture: [:url]) |> hd() do
^buffer ->
link_url(buffer, opts)
url ->
link = link_url(url, opts)
restore_stripped_symbols(buffer, url, link)
end
else
:nomatch
end
end
def check_and_link(:email, buffer, opts, _user_acc) do
if email?(buffer, opts), do: link_email(buffer, opts), else: :nomatch
end
def check_and_link(:mention, buffer, opts, user_acc) do
buffer
|> match_mention
|> link_mention(buffer, opts, user_acc)
end
def check_and_link(:hashtag, buffer, opts, user_acc) do
buffer
|> match_hashtag
|> link_hashtag(buffer, opts, user_acc)
end
def check_and_link(:extra, "xmpp:" <> handle = buffer, opts, _user_acc) do
if email?(handle, opts), do: link_extra(buffer, opts), else: :nomatch
end
def check_and_link(:extra, buffer, opts, _user_acc) do
if String.starts_with?(buffer, @prefix_extra), do: link_extra(buffer, opts), else: :nomatch
end
defp maybe_strip_parens(buffer) do
trimmed = trim_leading_paren(buffer)
with :next <- parens_check_trailing(buffer),
:next <- parens_found_email(trimmed),
:next <- parens_found_url(trimmed),
%{path: path, query: query} = URI.parse(trimmed),
:next <- parens_in_query(query),
:next <- parens_found_path_separator(path),
:next <- parens_path_has_open_paren(path),
:next <- parens_check_balanced(trimmed) do
buffer |> trim_leading_paren |> trim_trailing_paren
else
:both -> buffer |> trim_leading_paren |> trim_trailing_paren
:leading_only -> buffer |> trim_leading_paren
:noop -> buffer
_ -> buffer
end
end
defp parens_check_trailing(buffer), do: (String.ends_with?(buffer, ")") && :next) || :noop
defp parens_found_email(trimmed),
do: (trim_trailing_paren(trimmed) |> email?(nil) && :both) || :next
defp parens_found_url(trimmed),
do: (trim_trailing_paren(trimmed) |> url?(nil) && :next) || :noop
defp parens_in_query(query), do: (is_nil(query) && :next) || :both
defp parens_found_path_separator(path) when is_nil(path), do: :next
defp parens_found_path_separator(path), do: (String.contains?(path, "/") && :next) || :both
defp parens_path_has_open_paren(path) when is_nil(path), do: :next
defp parens_path_has_open_paren(path), do: (String.contains?(path, "(") && :next) || :both
defp parens_check_balanced(trimmed) do
graphemes = String.graphemes(trimmed)
opencnt = graphemes |> Enum.count(fn x -> x == "(" end)
closecnt = graphemes |> Enum.count(fn x -> x == ")" end)
if opencnt == closecnt do
:leading_only
else
:next
end
end
defp trim_leading_paren(buffer) do
case buffer do
"(" <> buffer -> buffer
buffer -> buffer
end
end
defp trim_trailing_paren(buffer),
do:
(String.ends_with?(buffer, ")") && String.slice(buffer, 0, String.length(buffer) - 1)) ||
buffer
defp strip_punctuation(buffer), do: String.replace(buffer, @delimiters, "")
defp strip_en_apostrophes(buffer) do
Enum.reduce(@en_apostrophes, buffer, fn abbrev, buf ->
String.replace_suffix(buf, abbrev, "")
end)
end
def url?(buffer, opts) do
valid_url?(buffer) && Regex.match?(@match_url, buffer) && valid_tld?(buffer, opts)
end
def email?(buffer, opts) do
# Note: In reality the local part can only be checked by the remote server
case Regex.run(~r/^(?<user>.*)@(?<host>[^@]+)$/, buffer, capture: [:user, :host]) do
[_user, hostname] -> valid_hostname?(hostname) && valid_tld?(hostname, opts)
_ -> false
end
end
- defp valid_url?(url), do: !Regex.match?(@invalid_url, url)
+ defp valid_url?(url) do
+ with {_, [scheme]} <- {:regex, Regex.run(@get_scheme_host, url, capture: [:scheme])},
+ true <- scheme == "" do
+ !Regex.match?(@invalid_url, url)
+ else
+ _ ->
+ true
+ end
+ end
@doc """
Validates a URL's TLD. Returns a boolean.
Will return `true` if `:validate_tld` option set to `false`.
Will skip validation and return `true` if `:validate_tld` set to `:no_scheme` and the url has a scheme.
"""
def valid_tld?(url, opts) do
[scheme, host] = Regex.run(@get_scheme_host, url, capture: [:scheme, :host])
cond do
opts[:validate_tld] == false ->
true
scheme != "" && ip?(host) ->
true
# don't validate if scheme is present
opts[:validate_tld] == :no_scheme and scheme != "" ->
true
true ->
tld = host |> strip_punctuation() |> String.split(".") |> List.last()
MapSet.member?(@tlds, tld)
end
end
def safe_to_integer(string, base \\ 10) do
String.to_integer(string, base)
rescue
_ ->
nil
end
def ip?(buffer) do
case :inet.parse_strict_address(to_charlist(buffer)) do
{:error, _} -> false
{:ok, _} -> true
end
end
# IDN-compatible, ported from musl-libc's is_valid_hostname()
def valid_hostname?(hostname) do
hostname
|> String.to_charlist()
|> Enum.any?(fn s ->
!(s >= 0x80 || s in 0x30..0x39 || s in 0x41..0x5A || s in 0x61..0x7A || s in '.-')
end)
|> Kernel.!()
end
def match_mention(buffer) do
case Regex.run(~r/^@(?<user>[a-zA-Z\d_-]+)(@(?<host>[^@]+))?$/, buffer,
capture: [:user, :host]
) do
[user, ""] ->
"@" <> user
[user, hostname] ->
if valid_hostname?(hostname) && valid_tld?(hostname, []),
do: "@" <> user <> "@" <> hostname,
else: nil
_ ->
nil
end
end
def match_hashtag(buffer) do
case Regex.run(@match_hashtag, buffer, capture: [:tag]) do
[hashtag] -> hashtag
_ -> nil
end
end
def link_hashtag(nil, _buffer, _, _user_acc), do: :nomatch
def link_hashtag(hashtag, buffer, %{hashtag_handler: hashtag_handler} = opts, user_acc) do
hashtag
|> hashtag_handler.(buffer, opts, user_acc)
|> maybe_update_buffer(hashtag, buffer)
end
def link_hashtag(hashtag, buffer, opts, _user_acc) do
hashtag
|> Builder.create_hashtag_link(buffer, opts)
|> maybe_update_buffer(hashtag, buffer)
end
def link_mention(nil, _buffer, _, _user_acc), do: :nomatch
def link_mention(mention, buffer, %{mention_handler: mention_handler} = opts, user_acc) do
mention
|> mention_handler.(buffer, opts, user_acc)
|> maybe_update_buffer(mention, buffer)
end
def link_mention(mention, buffer, opts, _user_acc) do
mention
|> Builder.create_mention_link(buffer, opts)
|> maybe_update_buffer(mention, buffer)
end
defp maybe_update_buffer(out, match, buffer) when is_binary(out) do
maybe_update_buffer({out, nil}, match, buffer)
end
defp maybe_update_buffer({out, user_acc}, match, buffer)
when match != buffer and out != buffer do
out = String.replace(buffer, match, out)
{out, user_acc}
end
defp maybe_update_buffer(out, _match, _buffer), do: out
@doc false
def link_url(buffer, opts) do
Builder.create_link(buffer, opts)
end
@doc false
def link_email(buffer, opts) do
Builder.create_email_link(buffer, opts)
end
def link_extra(buffer, opts) do
Builder.create_extra_link(buffer, opts)
end
defp link(buffer, opts, user_acc) do
Enum.reduce_while(@types, {buffer, user_acc}, fn type, _ ->
if opts[type] == true do
check_and_link_reducer(type, buffer, opts, user_acc)
else
{:cont, {buffer, user_acc}}
end
end)
end
defp check_and_link_reducer(type, buffer, opts, user_acc) do
str =
buffer
|> String.split("<")
|> List.first()
|> strip_en_apostrophes()
|> strip_punctuation()
|> maybe_strip_parens()
case check_and_link(type, str, opts, user_acc) do
:nomatch ->
{:cont, {buffer, user_acc}}
{link, user_acc} ->
{:halt, {restore_stripped_symbols(buffer, str, link), user_acc}}
link ->
{:halt, {restore_stripped_symbols(buffer, str, link), user_acc}}
end
end
defp restore_stripped_symbols(buffer, buffer, link), do: link
defp restore_stripped_symbols(buffer, stripped_buffer, link) do
buffer
|> String.split(stripped_buffer)
|> Enum.intersperse(link)
end
end
diff --git a/test/parser_test.exs b/test/parser_test.exs
index 50a6e68..aafecb5 100644
--- a/test/parser_test.exs
+++ b/test/parser_test.exs
@@ -1,314 +1,319 @@
# Copyright © 2017-2018 E-MetroTel
# Copyright © 2019-2022 Pleroma Authors
# SPDX-License-Identifier: MIT
defmodule Linkify.ParserTest do
use ExUnit.Case, async: true
doctest Linkify.Parser
import Linkify.Parser
describe "url?/2" do
test "valid scheme true" do
valid_scheme_urls()
|> Enum.each(fn url ->
assert url?(url, scheme: true, validate_tld: true)
end)
end
test "invalid scheme true" do
invalid_scheme_urls()
|> Enum.each(fn url ->
refute url?(url, scheme: true, validate_tld: true)
end)
end
test "valid scheme false" do
valid_non_scheme_urls()
|> Enum.each(fn url ->
assert url?(url, scheme: false, validate_tld: true)
end)
end
test "invalid scheme false" do
invalid_non_scheme_urls()
|> Enum.each(fn url ->
refute url?(url, scheme: false, validate_tld: true)
end)
end
test "checks the tld for url with a scheme when validate_tld: true" do
custom_tld_scheme_urls()
|> Enum.each(fn url ->
refute url?(url, scheme: true, validate_tld: true)
end)
end
test "does not check the tld for url with a scheme when validate_tld: false" do
custom_tld_scheme_urls()
|> Enum.each(fn url ->
assert url?(url, scheme: true, validate_tld: false)
end)
end
test "does not check the tld for url with a scheme when validate_tld: :no_scheme" do
custom_tld_scheme_urls()
|> Enum.each(fn url ->
assert url?(url, scheme: true, validate_tld: :no_scheme)
end)
end
test "checks the tld for url without a scheme when validate_tld: true" do
custom_tld_non_scheme_urls()
|> Enum.each(fn url ->
refute url?(url, scheme: false, validate_tld: true)
end)
end
test "checks the tld for url without a scheme when validate_tld: :no_scheme" do
custom_tld_non_scheme_urls()
|> Enum.each(fn url ->
refute url?(url, scheme: false, validate_tld: :no_scheme)
end)
end
test "does not check the tld for url without a scheme when validate_tld: false" do
custom_tld_non_scheme_urls()
|> Enum.each(fn url ->
assert url?(url, scheme: false, validate_tld: false)
end)
end
end
describe "email?" do
test "identifies valid emails" do
valid_emails()
|> Enum.each(fn email ->
assert email?(email, [])
end)
end
test "identifies invalid emails" do
invalid_emails()
|> Enum.each(fn email ->
refute email?(email, [])
end)
end
test "does not validate tlds when validate_tld: false" do
valid_custom_tld_emails()
|> Enum.each(fn email ->
assert email?(email, validate_tld: false)
end)
end
test "validates tlds when validate_tld: true" do
valid_custom_tld_emails()
|> Enum.each(fn email ->
refute email?(email, validate_tld: true)
end)
end
end
describe "parse" do
test "handle line breakes" do
text = "google.com\r\nssss"
expected = "<a href=\"http://google.com\">google.com</a>\r\nssss"
assert parse(text) == expected
end
test "handle angle bracket in the end" do
text = "google.com <br>"
assert parse(text) == "<a href=\"http://google.com\">google.com</a> <br>"
text = "google.com<br>hey"
assert parse(text) == "<a href=\"http://google.com\">google.com</a><br>hey"
text = "hey<br>google.com"
assert parse(text) == "hey<br><a href=\"http://google.com\">google.com</a>"
text = "<br />google.com"
assert parse(text) == "<br /><a href=\"http://google.com\">google.com</a>"
text = "google.com<"
assert parse(text) == "<a href=\"http://google.com\">google.com</a><"
text = "google.com>"
assert parse(text) == "<a href=\"http://google.com\">google.com</a>>"
end
test "does not link attributes" do
text = "Check out <a href='google.com'>google</a>"
assert parse(text) == text
text = "Check out <img src='google.com' alt='google.com'/>"
assert parse(text) == text
text = "Check out <span><img src='google.com' alt='google.com'/></span>"
assert parse(text) == text
end
test "does not link inside `<pre>` and `<code>`" do
text = "<pre>google.com</pre>"
assert parse(text) == text
text = "<code>google.com</code>"
assert parse(text) == text
text = "<pre><code>google.com</code></pre>"
assert parse(text) == text
end
test "links url inside html" do
text = "<div>google.com</div>"
expected = "<div><a href=\"http://google.com\">google.com</a></div>"
assert parse(text, class: false, rel: false) == expected
text = "Check out <div class='section'>google.com</div>"
expected =
"Check out <div class='section'><a href=\"http://google.com\">google.com</a></div>"
assert parse(text, class: false, rel: false) == expected
end
test "links url inside nested html" do
text = "<p><strong>google.com</strong></p>"
expected = "<p><strong><a href=\"http://google.com\">google.com</a></strong></p>"
assert parse(text, class: false, rel: false) == expected
end
test "html links inside html" do
text = ~s(<p><a href="http://google.com">google.com</a></p>)
assert parse(text) == text
text = ~s(<span><a href="http://google.com">google.com</a></span>)
assert parse(text) == text
text = ~s(<h1><a href="http://google.com">google.com</a></h1>)
assert parse(text) == text
text = ~s(<li><a href="http://google.com">google.com</a></li>)
assert parse(text) == text
end
test "do not link parens" do
text = " foo (https://example.com/path/folder/), bar"
expected =
" foo (<a href=\"https://example.com/path/folder/\">https://example.com/path/folder/</a>), bar"
assert parse(text, class: false, rel: false, scheme: true) == expected
text = " foo (example.com/path/folder/), bar"
expected =
" foo (<a href=\"http://example.com/path/folder/\">example.com/path/folder/</a>), bar"
assert parse(text, class: false, rel: false) == expected
end
test "do not link punctuation marks in the end" do
text = "google.com."
assert parse(text) == "<a href=\"http://google.com\">google.com</a>."
text = "google.com;"
assert parse(text) == "<a href=\"http://google.com\">google.com</a>;"
text = "google.com:"
assert parse(text) == "<a href=\"http://google.com\">google.com</a>:"
text = "hack google.com, please"
assert parse(text) == "hack <a href=\"http://google.com\">google.com</a>, please"
text = "(check out google.com)"
assert parse(text) == "(check out <a href=\"http://google.com\">google.com</a>)"
end
+ test "double dot in link is allowed" do
+ text = "https://example.to/something..mp3"
+ assert parse(text) == "<a href=\"#{text}\">#{text}</a>"
+ end
+
test "do not link urls" do
text = "google.com"
assert parse(text, url: false) == text
end
test "do not link `:test.test`" do
text = ":test.test"
assert parse(text, %{
scheme: true,
extra: true,
class: false,
strip_prefix: false,
new_window: false,
rel: false
}) == text
end
end
def valid_number?([list], number) do
assert List.last(list) == number
end
def valid_number?(_, _), do: false
def valid_scheme_urls,
do: [
"https://www.example.com",
"http://www2.example.com",
"http://home.example-site.com",
"http://blog.example.com",
"http://www.example.com/product",
"http://www.example.com/products?id=1&page=2",
"http://www.example.com#up",
"http://255.255.255.255",
"http://www.site.com:8008"
]
def invalid_scheme_urls,
do: [
"http://invalid.com/perl.cgi?key= | http://web-site.com/cgi-bin/perl.cgi?key1=value1&key2"
]
def valid_non_scheme_urls,
do: [
"www.example.com",
"www2.example.com",
"www.example.com:2000",
"www.example.com?abc=1",
"example.example-site.com",
"example.com",
"example.ca",
"example.tv",
"example.com:999?one=one"
]
def invalid_non_scheme_urls,
do: [
"invalid.com/perl.cgi?key= | web-site.com/cgi-bin/perl.cgi?key1=value1&key2",
"invalid.",
"hi..there",
"555.555.5555",
"255.255.255.255",
"255.255.255.255:3000?one=1&two=2"
]
def custom_tld_scheme_urls,
do: [
"http://whatever.null/",
"https://example.o/index.html",
"http://pleroma.i2p/test",
"http://misskey.loki"
]
def custom_tld_non_scheme_urls,
do: [
"whatever.null/",
"example.o/index.html",
"pleroma.i2p/test",
"misskey.loki"
]
def valid_emails, do: ["rms@ai.mit.edu", "vc@cock.li", "guardian@33y6fjyhs3phzfjj.onion"]
def invalid_emails, do: ["rms[at]ai.mit.edu", "vc@cock"]
def valid_custom_tld_emails, do: ["hi@company.null"]
end
File Metadata
Details
Attached
Mime Type
text/x-diff
Expires
Wed, Nov 27, 3:45 AM (1 d, 17 h)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
39780
Default Alt Text
(23 KB)
Attached To
Mode
R19 linkify
Attached
Detach File
Event Timeline
Log In to Comment